author    Stephen Rothwell <sfr@canb.auug.org.au>  2019-10-31 16:42:01 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>  2019-10-31 16:42:01 +1100
commit    cbf83a411543bc3296abee873842ee8ff2a3e6b7 (patch)
tree      fc725a3d1dc0c6ba65fdf2b1092a96d43ecbb76b
parent    d8d78c852ef4217d0fc3810614fa3c32b98ef9ed (diff)
parent    080dba92bc46ec72ffa03ba2a00079c7cdcce509 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--  .gitattributes | 2
-rw-r--r--  Documentation/admin-guide/sysctl/kernel.rst | 20
-rw-r--r--  Documentation/memory-barriers.txt | 16
-rw-r--r--  Documentation/process/deprecated.rst | 10
-rw-r--r--  arch/arc/include/asm/pgtable.h | 1
-rw-r--r--  arch/arc/mm/fault.c | 10
-rw-r--r--  arch/arc/mm/highmem.c | 4
-rw-r--r--  arch/arm64/mm/mmu.c | 4
-rw-r--r--  arch/ia64/mm/init.c | 4
-rw-r--r--  arch/powerpc/mm/mem.c | 3
-rw-r--r--  arch/s390/mm/init.c | 4
-rw-r--r--  arch/sh/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_32.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  drivers/base/memory.c | 7
-rw-r--r--  drivers/base/node.c | 9
-rw-r--r--  drivers/hv/hv_balloon.c | 4
-rw-r--r--  drivers/xen/balloon.c | 1
-rw-r--r--  fs/aio.c | 9
-rw-r--r--  fs/binfmt_elf.c | 56
-rw-r--r--  fs/buffer.c | 56
-rw-r--r--  fs/eventpoll.c | 52
-rw-r--r--  fs/hugetlbfs/inode.c | 4
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/file.c | 134
-rw-r--r--  fs/proc/generic.c | 37
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/page.c | 40
-rw-r--r--  fs/ramfs/inode.c | 12
-rw-r--r--  fs/userfaultfd.c | 3
-rw-r--r--  include/asm-generic/4level-fixup.h | 1
-rw-r--r--  include/asm-generic/5level-fixup.h | 1
-rw-r--r--  include/asm-generic/pgtable-nop4d.h | 2
-rw-r--r--  include/asm-generic/pgtable-nopmd.h | 2
-rw-r--r--  include/asm-generic/pgtable-nopud.h | 2
-rw-r--r--  include/asm-generic/pgtable.h | 11
-rw-r--r--  include/asm-generic/tlb.h | 4
-rw-r--r--  include/linux/build_bug.h | 4
-rw-r--r--  include/linux/gfp.h | 2
-rw-r--r--  include/linux/hugetlb.h | 3
-rw-r--r--  include/linux/memblock.h | 3
-rw-r--r--  include/linux/memcontrol.h | 8
-rw-r--r--  include/linux/memory_hotplug.h | 12
-rw-r--r--  include/linux/memremap.h | 6
-rw-r--r--  include/linux/mm.h | 19
-rw-r--r--  include/linux/mmzone.h | 2
-rw-r--r--  include/linux/notifier.h | 4
-rw-r--r--  include/linux/page-flags.h | 54
-rw-r--r--  include/linux/page-isolation.h | 4
-rw-r--r--  include/linux/proc_fs.h | 4
-rw-r--r--  include/linux/sched.h | 9
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/slab.h | 20
-rw-r--r--  include/linux/string.h | 45
-rw-r--r--  include/linux/sysctl.h | 6
-rw-r--r--  include/linux/thread_info.h | 2
-rw-r--r--  include/linux/vmstat.h | 50
-rw-r--r--  include/linux/wait.h | 4
-rw-r--r--  include/trace/events/kmem.h | 21
-rw-r--r--  include/trace/events/vmscan.h | 71
-rw-r--r--  ipc/mqueue.c | 105
-rw-r--r--  ipc/msg.c | 61
-rw-r--r--  ipc/sem.c | 66
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/hung_task.c | 94
-rw-r--r--  kernel/notifier.c | 41
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  lib/Kconfig.debug | 6
-rw-r--r--  lib/math/rational.c | 63
-rw-r--r--  lib/ubsan.c | 64
-rw-r--r--  mm/cma.c | 6
-rw-r--r--  mm/filemap.c | 21
-rw-r--r--  mm/gup.c | 8
-rw-r--r--  mm/huge_memory.c | 6
-rw-r--r--  mm/hugetlb.c | 284
-rw-r--r--  mm/internal.h | 27
-rw-r--r--  mm/khugepaged.c | 36
-rw-r--r--  mm/madvise.c | 2
-rw-r--r--  mm/memblock.c | 65
-rw-r--r--  mm/memcontrol.c | 109
-rw-r--r--  mm/memory-failure.c | 59
-rw-r--r--  mm/memory.c | 46
-rw-r--r--  mm/memory_hotplug.c | 174
-rw-r--r--  mm/memremap.c | 36
-rw-r--r--  mm/mmap.c | 85
-rw-r--r--  mm/mremap.c | 4
-rw-r--r--  mm/nommu.c | 10
-rw-r--r--  mm/oom_kill.c | 67
-rw-r--r--  mm/page_alloc.c | 151
-rw-r--r--  mm/page_io.c | 15
-rw-r--r--  mm/page_isolation.c | 12
-rw-r--r--  mm/pgtable-generic.c | 9
-rw-r--r--  mm/rmap.c | 58
-rw-r--r--  mm/shmem.c | 13
-rw-r--r--  mm/slab.c | 7
-rw-r--r--  mm/slab.h | 2
-rw-r--r--  mm/slab_common.c | 99
-rw-r--r--  mm/slub.c | 15
-rw-r--r--  mm/sparse.c | 2
-rw-r--r--  mm/swap.c | 29
-rw-r--r--  mm/swapfile.c | 7
-rw-r--r--  mm/userfaultfd.c | 73
-rw-r--r--  mm/util.c | 22
-rw-r--r--  mm/vmalloc.c | 140
-rw-r--r--  mm/vmscan.c | 44
-rw-r--r--  mm/vmstat.c | 111
-rw-r--r--  mm/z3fold.c | 357
-rw-r--r--  net/sunrpc/rpc_pipe.c | 2
-rwxr-xr-x  scripts/checkpatch.pl | 29
-rwxr-xr-x  scripts/get_maintainer.pl | 38
-rw-r--r--  tools/testing/selftests/Makefile | 1
-rw-r--r--  tools/testing/selftests/filesystems/epoll/.gitignore | 1
-rw-r--r--  tools/testing/selftests/filesystems/epoll/Makefile | 7
-rw-r--r--  tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c | 3074
-rw-r--r--  tools/testing/selftests/vm/config | 1
-rw-r--r--  tools/testing/selftests/vm/gup_benchmark.c | 2
117 files changed, 5395 insertions, 1384 deletions
diff --git a/.gitattributes b/.gitattributes
index 89c411b5ce6b..4b32eaa9571e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,4 @@
*.c diff=cpp
*.h diff=cpp
+*.dtsi diff=dts
+*.dts diff=dts
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 6e0da29e55f1..614179dc79a9 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -45,6 +45,7 @@ show up in /proc/sys/kernel:
- hung_task_timeout_secs
- hung_task_check_interval_secs
- hung_task_warnings
+- hung_task_interval_warnings
- hyperv_record_panic_msg
- kexec_load_disabled
- kptr_restrict
@@ -383,14 +384,29 @@ Possible values to set are in range {0..LONG_MAX/HZ}.
hung_task_warnings:
===================
-The maximum number of warnings to report. During a check interval
-if a hung task is detected, this value is decreased by 1.
+The maximum number of warnings to report. If a hung task is still
+present after the timeout, this value is decreased by 1 every check
+interval and a warning is produced.
When this value reaches 0, no more warnings will be reported.
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-1: report an infinite number of warnings.
+hung_task_interval_warnings:
+============================
+
+The same as hung_task_warnings, but sets the number of interval
+warnings to be issued about detected hung tasks during a check
+interval. These warnings are produced *before* the timeout happens.
+If a hung task is detected during a check interval, this value is
+decreased by 1. When this value reaches 0, only timeout warnings
+will be reported.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+-1: report an infinite number of check interval warnings.
+
+
hyperv_record_panic_msg:
========================
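A note on the hung-task knobs documented above: both sysctls are ordinary procfs files, so raising the new interval-warning budget is a one-line write from user space. A minimal sketch, assuming CONFIG_DETECT_HUNG_TASK and the patch above being applied:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Knob added by the patch above; only present with CONFIG_DETECT_HUNG_TASK. */
		int fd = open("/proc/sys/kernel/hung_task_interval_warnings", O_WRONLY);

		if (fd < 0) {
			perror("hung_task_interval_warnings");
			return 1;
		}
		if (write(fd, "10\n", 3) != 3)	/* allow ten pre-timeout warnings */
			perror("write");
		close(fd);
		return 0;
	}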
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 1adbb8a371c7..fe43f4b30907 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1873,12 +1873,16 @@ There are some more advanced barrier functions:
(*) smp_mb__before_atomic();
(*) smp_mb__after_atomic();
- These are for use with atomic (such as add, subtract, increment and
- decrement) functions that don't return a value, especially when used for
- reference counting. These functions do not imply memory barriers.
-
- These are also used for atomic bitop functions that do not return a
- value (such as set_bit and clear_bit).
+ These are for use with atomic RMW functions that do not imply memory
+ barriers, but where the code needs a memory barrier. Examples for atomic
+ RMW functions that do not imply a memory barrier are e.g. add,
+ subtract, (failed) conditional operations, _relaxed functions,
+ but not atomic_read or atomic_set. A common example where a memory
+ barrier may be required is when atomic ops are used for reference
+ counting.
+
+ These are also used for atomic RMW bitop functions that do not imply a
+ memory barrier (such as set_bit and clear_bit).
As an example, consider a piece of code that marks an object as being dead
and then decrements the object's reference count:
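For reference, the example that last sentence introduces in memory-barriers.txt is the classic store-then-put ordering; in outline (obj being whatever reference-counted object the surrounding text discusses):

	obj->dead = 1;
	smp_mb__before_atomic();	/* order the store to ->dead before the RMW below */
	atomic_dec(&obj->ref_count);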
diff --git a/Documentation/process/deprecated.rst b/Documentation/process/deprecated.rst
index 179f2a5625a0..a0ffdc8daef3 100644
--- a/Documentation/process/deprecated.rst
+++ b/Documentation/process/deprecated.rst
@@ -84,7 +84,7 @@ buffer. This could result in linear overflows beyond the
end of the buffer, leading to all kinds of misbehaviors. While
`CONFIG_FORTIFY_SOURCE=y` and various compiler flags help reduce the
risk of using this function, there is no good reason to add new uses of
-this function. The safe replacement is :c:func:`strscpy`.
+this function. The safe replacement is stracpy() or strscpy().
strncpy() on NUL-terminated strings
-----------------------------------
@@ -93,9 +93,9 @@ will be NUL terminated. This can lead to various linear read overflows
and other misbehavior due to the missing termination. It also NUL-pads the
destination buffer if the source contents are shorter than the destination
buffer size, which may be a needless performance penalty for callers using
-only NUL-terminated strings. The safe replacement is :c:func:`strscpy`.
-(Users of :c:func:`strscpy` still needing NUL-padding will need an
-explicit :c:func:`memset` added.)
+only NUL-terminated strings. In this case, the safe replacement is
+stracpy() or strscpy(). If, however, the destination buffer still needs
+NUL-padding, the safe replacement is stracpy_pad().
If a caller is using non-NUL-terminated strings, :c:func:`strncpy()` can
still be used, but destinations should be marked with the `__nonstring
@@ -107,7 +107,7 @@ strlcpy()
:c:func:`strlcpy` reads the entire source buffer first, possibly exceeding
the given limit of bytes to copy. This is inefficient and can lead to
linear read overflows if a source string is not NUL-terminated. The
-safe replacement is :c:func:`strscpy`.
+safe replacement is stracpy() or strscpy().
Variable Length Arrays (VLAs)
-----------------------------
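Coming back to the string-copy guidance above: strscpy() is already in the tree, while stracpy()/stracpy_pad() are the helpers proposed by this series. A minimal sketch of the recommended pattern for a fixed-size, NUL-terminated destination (illustration only, not part of the patch):

	#include <linux/string.h>

	/* strscpy() bounds the copy, always NUL-terminates, and reports
	 * truncation instead of silently overflowing the destination. */
	static int copy_name(char *dst, size_t dst_size, const char *src)
	{
		ssize_t len = strscpy(dst, src, dst_size);

		if (len < 0)	/* -E2BIG: src did not fit */
			return len;
		return 0;
	}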
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index ea14a8bfc691..9019ed9f9c94 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -33,7 +33,6 @@
#define _ASM_ARC_PGTABLE_H
#include <linux/bits.h>
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
#include <asm/page.h>
#include <asm/mmu.h> /* to propagate CONFIG_ARC_MMU_VER <n> */
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 3861543b66a0..fb86bc3e9b35 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -30,6 +30,7 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
* with the 'reference' page table.
*/
pgd_t *pgd, *pgd_k;
+ p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
@@ -39,8 +40,13 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address)
if (!pgd_present(*pgd_k))
goto bad_area;
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (!p4d_present(*p4d_k))
+ goto bad_area;
+
+ pud = pud_offset(p4d, address);
+ pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
goto bad_area;
diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c
index a4856bfaedf3..fc8849e4f72e 100644
--- a/arch/arc/mm/highmem.c
+++ b/arch/arc/mm/highmem.c
@@ -111,12 +111,14 @@ EXPORT_SYMBOL(__kunmap_atomic);
static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr)
{
pgd_t *pgd_k;
+ p4d_t *p4d_k;
pud_t *pud_k;
pmd_t *pmd_k;
pte_t *pte_k;
pgd_k = pgd_offset_k(kvaddr);
- pud_k = pud_offset(pgd_k, kvaddr);
+ p4d_k = p4d_offset(pgd_k, kvaddr);
+ pud_k = pud_offset(p4d_k, kvaddr);
pmd_k = pmd_offset(pud_k, kvaddr);
pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 60c929f3683b..d10247fab0fd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1069,7 +1069,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
/*
* FIXME: Cleanup page tables (also in arch_add_memory() in case
@@ -1078,7 +1077,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
* unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
* unlocked yet.
*/
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
}
#endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index bf9df2625bc8..a6dd80a2c939 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -689,9 +689,7 @@ void arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
}
#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index be941d382c8d..97e5922cb52e 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -130,10 +130,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
int ret;
- __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index a124f19f7b3c..c1d96e588152 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -291,10 +291,8 @@ void arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
vmem_remove_mapping(start, size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index dfdbaa50946e..d1b1ff2be17a 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -434,9 +434,7 @@ void arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 930edeb41ec3..0a74407ef92e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -865,10 +865,8 @@ void arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a6b5c653727b..b8541d77452c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1212,10 +1212,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
- struct zone *zone = page_zone(page);
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
kernel_physical_mapping_remove(start, start + size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 55907c27075b..a757d9ed88a7 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -538,12 +538,7 @@ static ssize_t soft_offline_page_store(struct device *dev,
if (kstrtoull(buf, 0, &pfn) < 0)
return -EINVAL;
pfn >>= PAGE_SHIFT;
- if (!pfn_valid(pfn))
- return -ENXIO;
- /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
- if (!pfn_to_online_page(pfn))
- return -EIO;
- ret = soft_offline_page(pfn_to_page(pfn), 0);
+ ret = soft_offline_page(pfn, 0);
return ret == 0 ? count : ret;
}
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 296546ffed6c..98a31bafc8a2 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -496,20 +496,17 @@ static ssize_t node_read_vmstat(struct device *dev,
int n = 0;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
+ n += sprintf(buf+n, "%s %lu\n", zone_stat_name(i),
sum_zone_node_page_state(nid, i));
#ifdef CONFIG_NUMA
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- n += sprintf(buf+n, "%s %lu\n",
- vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+ n += sprintf(buf+n, "%s %lu\n", numa_stat_name(i),
sum_zone_numa_state(nid, i));
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- n += sprintf(buf+n, "%s %lu\n",
- vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
- NR_VM_NUMA_STAT_ITEMS],
+ n += sprintf(buf+n, "%s %lu\n", node_stat_name(i),
node_page_state(pgdat, i));
return n;
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 34bd73526afd..da6ced15c6c5 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -680,9 +680,7 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
__ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
- __online_page_set_limits(pg);
- __online_page_increment_counters(pg);
- __online_page_free(pg);
+ generic_online_page(pg, 0);
lockdep_assert_held(&dm_device.ha_lock);
dm_device.num_pages_onlined++;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 5bae515c8e25..4f2e78a5e4db 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -374,7 +374,6 @@ static void xen_online_page(struct page *page, unsigned int order)
mutex_lock(&balloon_mutex);
for (i = 0; i < size; i++) {
p = pfn_to_page(start_pfn + i);
- __online_page_set_limits(p);
balloon_append(p);
}
mutex_unlock(&balloon_mutex);
diff --git a/fs/aio.c b/fs/aio.c
index 0d9a559d488c..1218a453d6db 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1287,12 +1287,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret),
+ until);
return ret;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c5642bcb6b46..54f21926940e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -404,6 +404,17 @@ static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
ELF_PAGESTART(cmds[first_idx].p_vaddr);
}
+static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
+{
+ ssize_t rv;
+
+ rv = kernel_read(file, buf, len, &pos);
+ if (unlikely(rv != len)) {
+ return (rv < 0) ? rv : -EIO;
+ }
+ return 0;
+}
+
/**
* load_elf_phdrs() - load ELF program headers
* @elf_ex: ELF header of the binary whose program headers should be loaded
@@ -418,7 +429,6 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
{
struct elf_phdr *elf_phdata = NULL;
int retval, err = -1;
- loff_t pos = elf_ex->e_phoff;
unsigned int size;
/*
@@ -439,9 +449,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
goto out;
/* Read in the program headers */
- retval = kernel_read(elf_file, elf_phdata, size, &pos);
- if (retval != size) {
- err = (retval < 0) ? retval : -EIO;
+ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
+ if (retval < 0) {
+ err = retval;
goto out;
}
@@ -544,7 +554,7 @@ static inline int make_prot(u32 p_flags)
an ELF header */
static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
- struct file *interpreter, unsigned long *interp_map_addr,
+ struct file *interpreter,
unsigned long no_base, struct elf_phdr *interp_elf_phdata)
{
struct elf_phdr *eppnt;
@@ -590,8 +600,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
map_addr = elf_map(interpreter, load_addr + vaddr,
eppnt, elf_prot, elf_type, total_size);
total_size = 0;
- if (!*interp_map_addr)
- *interp_map_addr = map_addr;
error = map_addr;
if (BAD_ADDR(map_addr))
goto out;
@@ -722,7 +730,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
- loff_t pos;
if (elf_ppnt->p_type != PT_INTERP)
continue;
@@ -740,14 +747,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (!elf_interpreter)
goto out_free_ph;
- pos = elf_ppnt->p_offset;
- retval = kernel_read(bprm->file, elf_interpreter,
- elf_ppnt->p_filesz, &pos);
- if (retval != elf_ppnt->p_filesz) {
- if (retval >= 0)
- retval = -EIO;
+ retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
+ elf_ppnt->p_offset);
+ if (retval < 0)
goto out_free_interp;
- }
/* make sure path is NULL terminated */
retval = -ENOEXEC;
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
@@ -766,14 +769,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
would_dump(bprm, interpreter);
/* Get the exec headers */
- pos = 0;
- retval = kernel_read(interpreter, &loc->interp_elf_ex,
- sizeof(loc->interp_elf_ex), &pos);
- if (retval != sizeof(loc->interp_elf_ex)) {
- if (retval >= 0)
- retval = -EIO;
+ retval = elf_read(interpreter, &loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex), 0);
+ if (retval < 0)
goto out_free_dentry;
- }
break;
@@ -1054,11 +1053,8 @@ out_free_interp:
}
if (interpreter) {
- unsigned long interp_map_addr = 0;
-
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
- &interp_map_addr,
load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
/*
@@ -1179,11 +1175,10 @@ static int load_elf_library(struct file *file)
unsigned long elf_bss, bss, len;
int retval, error, i, j;
struct elfhdr elf_ex;
- loff_t pos = 0;
error = -ENOEXEC;
- retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos);
- if (retval != sizeof(elf_ex))
+ retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
+ if (retval < 0)
goto out;
if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -1208,9 +1203,8 @@ static int load_elf_library(struct file *file)
eppnt = elf_phdata;
error = -ENOEXEC;
- pos = elf_ex.e_phoff;
- retval = kernel_read(file, eppnt, j, &pos);
- if (retval != j)
+ retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
+ if (retval < 0)
goto out_free_ph;
for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..131d39ec7d31 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1034,6 +1047,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1045,6 +1064,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3223,6 +3260,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3261,11 +3303,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3289,6 +3338,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c4159bcc05d9..67a395039268 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -551,28 +551,23 @@ out_unlock:
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct nested_calls poll_safewake_ncalls;
-
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
- unsigned long flags;
- wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
-
- spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
- wake_up_locked_poll(wqueue, EPOLLIN);
- spin_unlock_irqrestore(&wqueue->lock, flags);
-
- return 0;
-}
+static DEFINE_PER_CPU(int, wakeup_nest);
static void ep_poll_safewake(wait_queue_head_t *wq)
{
- int this_cpu = get_cpu();
-
- ep_call_nested(&poll_safewake_ncalls,
- ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+ unsigned long flags;
+ int subclass;
- put_cpu();
+ local_irq_save(flags);
+ preempt_disable();
+ subclass = __this_cpu_read(wakeup_nest);
+ spin_lock_nested(&wq->lock, subclass + 1);
+ __this_cpu_inc(wakeup_nest);
+ wake_up_locked_poll(wq, POLLIN);
+ __this_cpu_dec(wakeup_nest);
+ spin_unlock(&wq->lock);
+ local_irq_restore(flags);
+ preempt_enable();
}
#else
@@ -671,7 +666,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
void *priv, int depth, bool ep_locked)
{
__poll_t res;
- int pwake = 0;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
@@ -738,26 +732,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
*/
list_splice(&txlist, &ep->rdllist);
__pm_relax(ep->ws);
-
- if (!list_empty(&ep->rdllist)) {
- /*
- * Wake up (if active) both the eventpoll wait list and
- * the ->poll() wait list (delayed after we release the lock).
- */
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
- }
write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
- /* We have to call this outside the lock */
- if (pwake)
- ep_poll_safewake(&ep->poll_wait);
-
return res;
}
@@ -2370,11 +2349,6 @@ static int __init eventpoll_init(void)
*/
ep_nested_calls_init(&poll_loop_ncalls);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /* Initialize the structure used to perform safe poll wait head wake ups */
- ep_nested_calls_init(&poll_safewake_ncalls);
-#endif
-
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a478df035651..b9561bbf1dd9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 3e7da392aa6f..bb981ec76456 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 53939bf9d7d2..9876db52913a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2098,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
return 0;
}
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
- struct file *file,
- loff_t pos, size_t count,
- int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int overwrite_io,
+ int write_sem,
+ int wait)
{
- int ret;
- struct buffer_head *di_bh = NULL;
- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
- u32 clusters =
- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ int ret = 0;
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode,
+ overwrite_io ? NULL : di_bh, meta_level);
+ if (ret < 0)
goto out;
+
+ if (wait) {
+ if (write_sem)
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ if (write_sem)
+ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!ret) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}
- *meta_level = 1;
+ return ret;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
- if (ret)
- mlog_errno(ret);
+out_unlock:
+ brelse(*di_bh);
+ ocfs2_inode_unlock(inode, meta_level);
out:
- brelse(di_bh);
return ret;
}
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem)
+{
+ if (write_sem)
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(*di_bh);
+ *di_bh = NULL;
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+}
+
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos, size_t count, int wait)
{
int ret = 0, meta_level = 0, overwrite_io = 0;
+ int write_sem = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
+ u32 cpos;
+ u32 clusters;
/*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
for(;;) {
- if (wait)
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
- else
- ret = ocfs2_try_inode_lock(inode,
- overwrite_io ? NULL : &di_bh, meta_level);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ write_sem,
+ wait);
if (ret < 0) {
- meta_level = -1;
if (ret != -EAGAIN)
mlog_errno(ret);
goto out;
@@ -2156,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
if (!wait && !overwrite_io) {
overwrite_io = 1;
- if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
- brelse(di_bh);
- di_bh = NULL;
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0) {
if (ret != -EAGAIN)
mlog_errno(ret);
@@ -2183,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
meta_level = 1;
continue;
}
@@ -2197,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
- ocfs2_inode_unlock(inode, meta_level);
- meta_level = -1;
-
- ret = ocfs2_prepare_inode_for_refcount(inode,
- file,
- pos,
- count,
- &meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ 1,
+ wait);
+ write_sem = 1;
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_unlock;
}
@@ -2219,10 +2265,10 @@ out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
pos, count, wait);
- brelse(di_bh);
-
- if (meta_level >= 0)
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
out:
return ret;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 64e9ee1b129e..074e9585c699 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -138,8 +138,12 @@ static int proc_getattr(const struct path *path, struct kstat *stat,
{
struct inode *inode = d_inode(path->dentry);
struct proc_dir_entry *de = PDE(inode);
- if (de && de->nlink)
- set_nlink(inode, de->nlink);
+ if (de) {
+ nlink_t nlink = READ_ONCE(de->nlink);
+ if (nlink > 0) {
+ set_nlink(inode, nlink);
+ }
+ }
generic_fillattr(inode, stat);
return 0;
@@ -159,7 +163,6 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
const char *cp = name, *next;
struct proc_dir_entry *de;
- unsigned int len;
de = *ret;
if (!de)
@@ -170,13 +173,12 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
if (!next)
break;
- len = next - cp;
- de = pde_subdir_find(de, cp, len);
+ de = pde_subdir_find(de, cp, next - cp);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
}
- cp += len + 1;
+ cp = next + 1;
}
*residual = cp;
*ret = de;
@@ -362,6 +364,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
write_unlock(&proc_subdir_lock);
goto out_free_inum;
}
+ dir->nlink++;
write_unlock(&proc_subdir_lock);
return dp;
@@ -472,10 +475,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->data = data;
ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
- parent->nlink++;
ent = proc_register(parent, ent);
- if (!ent)
- parent->nlink--;
}
return ent;
}
@@ -505,10 +505,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
- parent->nlink++;
ent = proc_register(parent, ent);
- if (!ent)
- parent->nlink--;
}
return ent;
}
@@ -666,8 +663,12 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
len = strlen(fn);
de = pde_subdir_find(parent, fn, len);
- if (de)
+ if (de) {
rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode)) {
+ parent->nlink--;
+ }
+ }
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -676,9 +677,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
proc_entry_rundown(de);
- if (S_ISDIR(de->mode))
- parent->nlink--;
- de->nlink = 0;
WARN(pde_subdir_first(de),
"%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
__func__, de->parent->name, de->name, pde_subdir_first(de)->name);
@@ -714,13 +712,12 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- write_unlock(&proc_subdir_lock);
-
- proc_entry_rundown(de);
next = de->parent;
if (S_ISDIR(de->mode))
next->nlink--;
- de->nlink = 0;
+ write_unlock(&proc_subdir_lock);
+
+ proc_entry_rundown(de);
if (de == root)
break;
pde_put(de);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..0f3b557c9b77 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -197,8 +197,8 @@ extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, lof
* inode.c
*/
struct pde_opener {
- struct file *file;
struct list_head lh;
+ struct file *file;
bool closing;
struct completion *c;
} __randomize_layout;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7c952ee732e6..e40dbfe1168e 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -97,7 +97,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -109,22 +112,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -135,14 +138,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -150,23 +152,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -179,7 +181,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d82636e8eb65..35624ca2a2f9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -147,6 +147,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
+static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -157,6 +168,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
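What the new .tmpfile hook buys user space: O_TMPFILE opens now work on ramfs mounts. A minimal sketch, with a hypothetical mount point (not part of the patch):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* "/mnt/ramfs" stands in for any ramfs (or other .tmpfile-capable) mount. */
		int fd = open("/mnt/ramfs", O_TMPFILE | O_RDWR, 0600);

		if (fd < 0) {
			perror("O_TMPFILE");
			return 1;
		}
		/* The file has no name; its pages are released automatically on close. */
		if (write(fd, "scratch", 7) != 7)
			perror("write");
		close(fd);
		return 0;
	}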
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d7f54e535294..f18e6e828b21 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
- new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ new_flags = (vma->vm_flags &
+ ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index e3667c9a33a5..c86cf7cb4bba 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -30,7 +30,6 @@
#undef pud_free_tlb
#define pud_free_tlb(tlb, x, addr) do { } while (0)
#define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, addr) do { } while (0)
#undef pud_addr_end
#define pud_addr_end(addr, end) (end)
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index f6947da70d71..4c74b1c1d13b 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -51,7 +51,6 @@ static inline int p4d_present(p4d_t p4d)
#undef p4d_free_tlb
#define p4d_free_tlb(tlb, x, addr) do { } while (0)
#define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, addr) do { } while (0)
#undef p4d_addr_end
#define p4d_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index aebab905e6cd..ce2cbb3c380f 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -50,7 +50,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
*/
#define p4d_alloc_one(mm, address) NULL
#define p4d_free(mm, x) do { } while (0)
-#define __p4d_free_tlb(tlb, x, a) do { } while (0)
+#define p4d_free_tlb(tlb, x, a) do { } while (0)
#undef p4d_addr_end
#define p4d_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h
index b85b8271a73d..0d9b28cba16d 100644
--- a/include/asm-generic/pgtable-nopmd.h
+++ b/include/asm-generic/pgtable-nopmd.h
@@ -60,7 +60,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
}
-#define __pmd_free_tlb(tlb, x, a) do { } while (0)
+#define pmd_free_tlb(tlb, x, a) do { } while (0)
#undef pmd_addr_end
#define pmd_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index c77a1d301155..d3776cb494c0 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -59,7 +59,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
*/
#define pud_alloc_one(mm, address) NULL
#define pud_free(mm, x) do { } while (0)
-#define __pud_free_tlb(tlb, x, a) do { } while (0)
+#define pud_free_tlb(tlb, x, a) do { } while (0)
#undef pud_addr_end
#define pud_addr_end(addr, end) (end)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 818691846c90..9cdcbc7c0b7b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,8 +558,19 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
* Do the tests inline, but report and clear the bad entry in mm/memory.c.
*/
void pgd_clear_bad(pgd_t *);
+
+#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
+#else
+#define p4d_clear_bad(p4d) do { } while (0)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
+#else
+#define pud_clear_bad(p4d) do { } while (0)
+#endif
+
void pmd_clear_bad(pmd_t *);
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 04c0644006fd..05dddc17522b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -584,7 +584,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
} while (0)
#endif
-#ifndef __ARCH_HAS_4LEVEL_HACK
#ifndef pud_free_tlb
#define pud_free_tlb(tlb, pudp, address) \
do { \
@@ -594,9 +593,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
__pud_free_tlb(tlb, pudp, address); \
} while (0)
#endif
-#endif
-#ifndef __ARCH_HAS_5LEVEL_HACK
#ifndef p4d_free_tlb
#define p4d_free_tlb(tlb, pudp, address) \
do { \
@@ -605,7 +602,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
#endif
-#endif
#endif /* CONFIG_MMU */
diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h
index 0fe5426f2bdc..e3a0be2c90ad 100644
--- a/include/linux/build_bug.h
+++ b/include/linux/build_bug.h
@@ -9,11 +9,11 @@
#else /* __CHECKER__ */
/*
* Force a compilation error if condition is true, but also produce a
- * result (of value 0 and type size_t), so the expression can be used
+ * result (of value 0 and type int), so the expression can be used
* e.g. in a structure initializer (or where-ever else comma expressions
* aren't permitted).
*/
-#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); }))
+#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); })))
#endif /* __CHECKER__ */
/* Force a compilation error if a constant expression is not a power of 2 */
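The int-typed result above matters because BUILD_BUG_ON_ZERO() is meant to be usable inside expressions, for example the array check folded into ARRAY_SIZE(). A standalone re-creation for illustration only; MUST_BE_ARRAY is a made-up stand-in for the kernel's __must_be_array():

	#include <stddef.h>

	/* 0 if e is false, otherwise a compile error via a negative bitfield width. */
	#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); })))

	/* Fails to compile when a pointer, rather than a real array, is passed. */
	#define MUST_BE_ARRAY(a) \
		BUILD_BUG_ON_ZERO(__builtin_types_compatible_p(typeof(a), typeof(&(a)[0])))
	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]) + MUST_BE_ARRAY(a))

	int main(void)
	{
		int buf[8];

		return (int)ARRAY_SIZE(buf) - 8;	/* 0: the added term is an int-typed zero */
	}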
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 61f2f6ff9467..e5b817cb86e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
+extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
#endif
void free_contig_range(unsigned long pfn, unsigned int nr_pages);
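A sketch of a caller of the interface declared above, assuming the semantics described in this series: allocate a physically contiguous run of pages on a node, then hand the range back. The function name is hypothetical, illustration only:

	static int demo_use_contig_range(unsigned long nr_pages, int nid)
	{
		struct page *page;

		page = alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
		if (!page)
			return -ENOMEM;

		/* ... use the physically contiguous range [page, page + nr_pages) ... */

		free_contig_range(page_to_pfn(page), nr_pages);
		return 0;
	}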
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 53fc34f930d0..4c5a16be7912 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address);
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f491690d54c6..b38bbefabfab 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
MEMBLOCK_ALLOC_ACCESSIBLE);
}
+void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid);
void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ae703ea3ef48..e82928deea88 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,7 +58,6 @@ enum mem_cgroup_protection {
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
- int priority;
unsigned int generation;
};
@@ -112,7 +111,7 @@ struct memcg_shrinker_map {
};
/*
- * per-zone information in memory controller.
+ * per-node information in memory controller.
*/
struct mem_cgroup_per_node {
struct lruvec lruvec;
@@ -126,7 +125,7 @@ struct mem_cgroup_per_node {
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
- struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+ struct mem_cgroup_reclaim_iter iter;
struct memcg_shrinker_map __rcu *shrinker_map;
@@ -399,8 +398,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
* @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for a given @node or a given
- * @memcg and @zone. This can be the node lruvec, if the memory controller
- * is disabled.
+ * @memcg. This can be the node lruvec, if the memory controller is disabled.
*/
static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
struct mem_cgroup *memcg)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f46ea71b4ffd..384ffb3d69ab 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -102,13 +102,10 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
+extern void generic_online_page(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
-extern void __online_page_set_limits(struct page *page);
-extern void __online_page_increment_counters(struct page *page);
-extern void __online_page_free(struct page *page);
-
extern int try_online_node(int nid);
extern int arch_add_memory(int nid, u64 start, u64 size,
@@ -125,8 +122,8 @@ static inline bool movable_node_is_enabled(void)
extern void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap);
-extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap);
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
@@ -345,6 +342,9 @@ extern int add_memory(int nid, u64 start, u64 size);
extern int add_memory_resource(int nid, struct resource *resource);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void remove_pfn_range_from_zone(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages);
extern bool is_memblock_offlined(struct memory_block *mem);
extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6fefb09af7c3..c676e33205d3 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -123,6 +123,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -133,6 +134,11 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc292273e6ba..82e4aaa8bef9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1648,19 +1648,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
return (unsigned long)val;
}
+void mm_trace_rss_stat(int member, long count);
+
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
- atomic_long_add(value, &mm->rss_stat.count[member]);
+ long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(member, count);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
- atomic_long_inc(&mm->rss_stat.count[member]);
+ long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(member, count);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
- atomic_long_dec(&mm->rss_stat.count[member]);
+ long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+ mm_trace_rss_stat(member, count);
}
/* Optimized variant when page is already known not to be PageAnon */
@@ -2219,9 +2227,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
extern void setup_per_cpu_pageset(void);
-extern void zone_pcp_update(struct zone *zone);
-extern void zone_pcp_reset(struct zone *zone);
-
/* page_alloc.c */
extern int min_free_kbytes;
extern int watermark_boost_factor;
@@ -2786,7 +2791,7 @@ extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages __read_mostly;
-extern int soft_offline_page(struct page *page, int flags);
+extern int soft_offline_page(unsigned long pfn, int flags);
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b0a36d1580b6..d4ca03b93373 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1079,7 +1079,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
/**
* for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
* @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
+ * @z - The current pointer within zonelist->_zonerefs being iterated
* @zlist - The zonelist being iterated
* @highidx - The zone index of the highest zone to return
* @nodemask - Nodemask allowed by the allocator
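For orientation, the usage shape of the iterator whose kernel-doc is touched above, as a sketch; zonelist, highest_zoneidx and nodemask are assumed to come from the surrounding allocation context:

	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, highest_zoneidx, nodemask) {
		/* inspect or account each candidate zone for this request */
	}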
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 0096a05395e3..018947611483 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -150,10 +150,6 @@ extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *nb);
-extern int blocking_notifier_chain_cond_register(
- struct blocking_notifier_head *nh,
- struct notifier_block *nb);
-
extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
struct notifier_block *nb);
extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f91cb8898ff0..3b8e5c5f7e1f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -610,27 +610,6 @@ static inline int PageTransCompound(struct page *page)
}
/*
- * PageTransCompoundMap is the same as PageTransCompound, but it also
- * guarantees the primary MMU has the entire compound page mapped
- * through pmd_trans_huge, which in turn guarantees the secondary MMUs
- * can also map the entire compound page. This allows the secondary
- * MMUs to call get_user_pages() only once for each compound page and
- * to immediately map the entire compound page with a single secondary
- * MMU fault. If there will be a pmd split later, the secondary MMUs
- * will get an update through the MMU notifier invalidation through
- * split_huge_pmd().
- *
- * Unlike PageTransCompound, this is safe to be called only while
- * split_huge_pmd() cannot run from under us, like if protected by the
- * MMU notifier, otherwise it may result in page->_mapcount < 0 false
- * positives.
- */
-static inline int PageTransCompoundMap(struct page *page)
-{
- return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0;
-}
-
-/*
* PageTransTail returns true for both transparent huge pages
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
@@ -681,6 +660,39 @@ static inline int TestClearPageDoubleMap(struct page *page)
return test_and_clear_bit(PG_double_map, &page[1].flags);
}
+/*
+ * PageTransCompoundMap is the same as PageTransCompound, but it also
+ * guarantees the primary MMU has the entire compound page mapped
+ * through pmd_trans_huge, which in turn guarantees the secondary MMUs
+ * can also map the entire compound page. This allows the secondary
+ * MMUs to call get_user_pages() only once for each compound page and
+ * to immediately map the entire compound page with a single secondary
+ * MMU fault. If there will be a pmd split later, the secondary MMUs
+ * will get an update through the MMU notifier invalidation through
+ * split_huge_pmd().
+ *
+ * Unlike PageTransCompound, this is safe to be called only while
+ * split_huge_pmd() cannot run from under us, like if protected by the
+ * MMU notifier, otherwise it may result in page->_mapcount check false
+ * positives.
+ *
+ * We have to treat page cache THP differently, since every subpage of it
+ * gets its _mapcount inc'ed once it is PMD mapped. But it may also be PTE
+ * mapped in the current process, so check the PageDoubleMap flag to rule
+ * this out.
+ */
+static inline int PageTransCompoundMap(struct page *page)
+{
+ bool pmd_mapped;
+
+ if (PageAnon(page))
+ pmd_mapped = atomic_read(&page->_mapcount) < 0;
+ else
+ pmd_mapped = atomic_read(&page->_mapcount) >= 0 &&
+ !PageDoubleMap(compound_head(page));
+
+ return PageTransCompound(page) && pmd_mapped;
+}
#else
TESTPAGEFLAG_FALSE(TransHuge)
TESTPAGEFLAG_FALSE(TransCompound)
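
For reference, a minimal userspace model of the decision the reworked PageTransCompoundMap() makes above; struct model_page and its fields are invented stand-ins for the example, not kernel types. Anonymous THP counts as PMD-mapped when the subpage _mapcount is negative, page cache THP only when _mapcount is >= 0 and the compound head is not double-mapped.

/* Userspace model of the PageTransCompoundMap() decision logic. */
#include <stdbool.h>
#include <stdio.h>

struct model_page {
	bool anon;		/* stand-in for PageAnon() */
	bool double_map;	/* stand-in for PageDoubleMap() on the head */
	int mapcount;		/* stand-in for atomic_read(&page->_mapcount) */
	bool trans_compound;	/* stand-in for PageTransCompound() */
};

static bool model_trans_compound_map(const struct model_page *p)
{
	bool pmd_mapped;

	if (p->anon)
		pmd_mapped = p->mapcount < 0;
	else
		pmd_mapped = p->mapcount >= 0 && !p->double_map;

	return p->trans_compound && pmd_mapped;
}

int main(void)
{
	struct model_page anon_thp = { true, false, -1, true };
	struct model_page file_thp_pte = { false, true, 0, true };

	printf("anon THP PMD-mapped: %d\n", model_trans_compound_map(&anon_thp));
	printf("double-mapped file THP: %d\n", model_trans_compound_map(&file_thp_pte));
	return 0;
}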
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 1099c2fee20f..6861df759fad 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype)
}
#endif
-#define SKIP_HWPOISON 0x1
+#define MEMORY_OFFLINE 0x1
#define REPORT_FAILURE 0x2
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
@@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Test all pages in [start_pfn, end_pfn) are isolated or not.
*/
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages);
+ int isol_flags);
struct page *alloc_migrate_target(struct page *page, unsigned long private);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index a705aa2d03f9..0640be56dcbd 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -58,8 +58,8 @@ extern int remove_proc_subtree(const char *, struct proc_dir_entry *);
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
struct proc_dir_entry *parent, const struct seq_operations *ops,
unsigned int state_size, void *data);
-#define proc_create_net(name, mode, parent, state_size, ops) \
- proc_create_net_data(name, mode, parent, state_size, ops, NULL)
+#define proc_create_net(name, mode, parent, ops, state_size) \
+ proc_create_net_data(name, mode, parent, ops, state_size, NULL)
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
struct proc_dir_entry *parent,
int (*show)(struct seq_file *, void *), void *data);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f72984f94a5c..d6c80e7991df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -905,6 +905,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1251,6 +1252,7 @@ struct task_struct {
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
+ struct list_head oom_victim_list;
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
@@ -1271,6 +1273,13 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d4f6215ee03f..89f55e914673 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -12,6 +12,7 @@ extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_check_interval_secs;
extern int sysctl_hung_task_warnings;
+extern int sysctl_hung_task_interval_warnings;
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6d52953dbdf0..270f03ee3f83 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
return __kmalloc(size, flags);
}
-/*
- * Determine size used for the nth kmalloc cache.
- * return size or 0 if a kmalloc cache for that
- * size does not exist
- */
-static __always_inline unsigned int kmalloc_size(unsigned int n)
-{
-#ifndef CONFIG_SLOB
- if (n > 2)
- return 1U << n;
-
- if (n == 1 && KMALLOC_MIN_SIZE <= 32)
- return 96;
-
- if (n == 2 && KMALLOC_MIN_SIZE <= 64)
- return 192;
-#endif
- return 0;
-}
-
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
diff --git a/include/linux/string.h b/include/linux/string.h
index b6ccdc2c7f02..f516cec5277c 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -35,6 +35,51 @@ ssize_t strscpy(char *, const char *, size_t);
/* Wraps calls to strscpy()/memset(), no arch specific code required */
ssize_t strscpy_pad(char *dest, const char *src, size_t count);
+/**
+ * stracpy - Copy a C-string into an array of char/u8/s8 or equivalent
+ * @dest: Where to copy the string, must be an array of char and not a pointer
+ * @src: String to copy, may be a pointer or const char array
+ *
+ * Helper for strscpy().
+ * Copies a maximum of sizeof(@dest) bytes of @src with %NUL termination.
+ *
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if @dest is a zero size array or @src was truncated.
+ */
+#define stracpy(dest, src) \
+({ \
+ size_t count = ARRAY_SIZE(dest); \
+ BUILD_BUG_ON(!(__same_type(dest, char[]) || \
+ __same_type(dest, unsigned char[]) || \
+ __same_type(dest, signed char[]))); \
+ \
+ strscpy(dest, src, count); \
+})
+
+/**
+ * stracpy_pad - Copy a C-string into an array of char/u8/s8 with %NUL padding
+ * @dest: Where to copy the string, must be an array of char and not a pointer
+ * @src: String to copy, may be a pointer or const char array
+ *
+ * Helper for strscpy_pad().
+ * Copies a maximum of sizeof(@dest) bytes of @src with %NUL termination
+ * and zero-pads the remaining bytes of @dest.
+ *
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if @dest is a zero size array or @src was truncated.
+ */
+#define stracpy_pad(dest, src) \
+({ \
+ size_t count = ARRAY_SIZE(dest); \
+ BUILD_BUG_ON(!(__same_type(dest, char[]) || \
+ __same_type(dest, unsigned char[]) || \
+ __same_type(dest, signed char[]))); \
+ \
+ strscpy_pad(dest, src, count); \
+})
+
#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
#endif
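
The stracpy()/stracpy_pad() helpers above exist so that the copy bound is always derived from the destination array itself. A rough userspace sketch of the same idea follows; my_strscpy() and my_stracpy() are invented names, E2BIG_ERR stands in for -E2BIG, and the BUILD_BUG_ON()/__same_type() check is omitted.

#include <stdio.h>
#include <string.h>

#define E2BIG_ERR (-7)	/* stand-in for -E2BIG */

/* Simplified userspace imitation of strscpy() semantics. */
static long my_strscpy(char *dest, const char *src, size_t count)
{
	size_t len;

	if (count == 0)
		return E2BIG_ERR;
	len = strlen(src);
	if (len >= count) {		/* would truncate */
		memcpy(dest, src, count - 1);
		dest[count - 1] = '\0';
		return E2BIG_ERR;
	}
	memcpy(dest, src, len + 1);
	return (long)len;
}

/* The bound is taken from the array inside the macro, so it always
 * matches the buffer that is actually written to.
 */
#define my_stracpy(dest, src) my_strscpy(dest, src, sizeof(dest))

int main(void)
{
	char buf[8];

	printf("copied %ld -> \"%s\"\n", my_stracpy(buf, "short"), buf);
	printf("copied %ld -> \"%s\"\n", my_stracpy(buf, "a very long string"), buf);
	return 0;
}

The kernel macro additionally rejects pointer arguments at compile time via the type check, which the sketch leaves out.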
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6df477329b76..02fa84493f23 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -120,8 +120,7 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
/* A sysctl table is an array of struct ctl_table: */
-struct ctl_table
-{
+struct ctl_table {
const char *procname; /* Text ID for /proc/sys, or zero */
void *data;
int maxlen;
@@ -140,8 +139,7 @@ struct ctl_node {
/* struct ctl_table_header is used to maintain dynamic lists of
struct ctl_table trees. */
-struct ctl_table_header
-{
+struct ctl_table_header {
union {
struct {
struct ctl_table *ctl_table;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 659a4400517b..e93e249a4e9b 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -147,6 +147,8 @@ check_copy_size(const void *addr, size_t bytes, bool is_source)
__bad_copy_to();
return false;
}
+ if (WARN_ON_ONCE(bytes > INT_MAX))
+ return false;
check_object_size(addr, bytes, is_source);
return true;
}
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index bdeda4b079fe..292485f3d24d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -31,6 +31,12 @@ struct reclaim_stat {
unsigned nr_unmap_fail;
};
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
#ifdef CONFIG_VM_EVENT_COUNTERS
/*
* Light weight per cpu counter implementation.
@@ -381,4 +387,48 @@ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
extern const char * const vmstat_text[];
+static inline const char *zone_stat_name(enum zone_stat_item item)
+{
+ return vmstat_text[item];
+}
+
+#ifdef CONFIG_NUMA
+static inline const char *numa_stat_name(enum numa_stat_item item)
+{
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ item];
+}
+#endif /* CONFIG_NUMA */
+
+static inline const char *node_stat_name(enum node_stat_item item)
+{
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS +
+ item];
+}
+
+static inline const char *lru_list_name(enum lru_list lru)
+{
+ return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
+}
+
+static inline const char *writeback_stat_name(enum writeback_stat_item item)
+{
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ item];
+}
+
+#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+static inline const char *vm_event_name(enum vm_event_item item)
+{
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ item];
+}
+#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+
#endif /* _LINUX_VMSTAT_H */
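
The helpers above all index one flat vmstat_text[] array, with each group's offset being the sum of the sizes of the preceding groups. A small userspace sketch of that layout, using made-up enums and strings:

#include <stdio.h>

enum zone_item { ZONE_FREE_PAGES, NR_ZONE_ITEMS };
enum node_item { NODE_FILE_PAGES, NODE_ANON_PAGES, NR_NODE_ITEMS };
enum wb_item   { WB_DIRTY_THRESHOLD, NR_WB_ITEMS };

static const char *const stat_text[] = {
	/* zone items first */
	"nr_free_pages",
	/* then node items */
	"nr_file_pages",
	"nr_anon_pages",
	/* then writeback items */
	"dirty_threshold",
};

static const char *zone_name(enum zone_item i) { return stat_text[i]; }
static const char *node_name(enum node_item i) { return stat_text[NR_ZONE_ITEMS + i]; }
static const char *wb_name(enum wb_item i)
{
	return stat_text[NR_ZONE_ITEMS + NR_NODE_ITEMS + i];
}

int main(void)
{
	printf("%s %s %s\n", zone_name(ZONE_FREE_PAGES),
	       node_name(NODE_ANON_PAGES), wb_name(WB_DIRTY_THRESHOLD));
	return 0;
}

Because the offsets are summed from the enum sizes, adding an item to an earlier group automatically shifts every later group, which is what the kernel helpers rely on.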
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..3647d1008d88 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -540,7 +540,7 @@ do { \
({ \
int __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
@@ -566,7 +566,7 @@ do { \
({ \
long __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq, condition, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 69e8bb8963db..5a0666bfcf85 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -316,6 +316,27 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->change_ownership)
);
+TRACE_EVENT(rss_stat,
+
+ TP_PROTO(int member,
+ long count),
+
+ TP_ARGS(member, count),
+
+ TP_STRUCT__entry(
+ __field(int, member)
+ __field(long, size)
+ ),
+
+ TP_fast_assign(
+ __entry->member = member;
+ __entry->size = (count << PAGE_SHIFT);
+ ),
+
+ TP_printk("member=%d size=%ldB",
+ __entry->member,
+ __entry->size)
+ );
#endif /* _TRACE_KMEM_H */
/* This part must be outside protection */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index a5ab2973e8dc..c37e2280e6dd 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -127,18 +127,43 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_b
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
- TP_PROTO(int order, gfp_t gfp_flags),
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
- TP_ARGS(order, gfp_flags)
+ TP_ARGS(cgroup_ino, order, gfp_flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cgroup_ino)
+ __field(int, order)
+ __field(gfp_t, gfp_flags)
+ ),
+
+ TP_fast_assign(
+ __entry->cgroup_ino = cgroup_ino;
+ __entry->order = order;
+ __entry->gfp_flags = gfp_flags;
+ ),
+
+ TP_printk("cgroup_ino=%u order=%d gfp_flags=%s",
+ __entry->cgroup_ino, __entry->order,
+ show_gfp_flags(__entry->gfp_flags))
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+ mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, gfp_t gfp_flags),
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
- TP_ARGS(order, gfp_flags)
+ TP_ARGS(cgroup_ino, order, gfp_flags)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+ mm_vmscan_memcg_softlimit_reclaim_begin,
+
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
+
+ TP_ARGS(cgroup_ino, order, gfp_flags)
);
#endif /* CONFIG_MEMCG */
@@ -167,18 +192,40 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_end_template,
- TP_PROTO(unsigned long nr_reclaimed),
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
- TP_ARGS(nr_reclaimed)
+ TP_ARGS(cgroup_ino, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cgroup_ino)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->cgroup_ino = cgroup_ino;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("cgroup_ino=%u nr_reclaimed=%lu",
+ __entry->cgroup_ino, __entry->nr_reclaimed)
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+ mm_vmscan_memcg_reclaim_end,
- TP_PROTO(unsigned long nr_reclaimed),
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
- TP_ARGS(nr_reclaimed)
+ TP_ARGS(cgroup_ino, nr_reclaimed)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+ mm_vmscan_memcg_softlimit_reclaim_end,
+
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
+
+ TP_ARGS(cgroup_ino, nr_reclaimed)
);
#endif /* CONFIG_MEMCG */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3d920ff15c80..49a05ba3000d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -63,6 +63,66 @@ struct posix_msg_tree_node {
int priority;
};
+/*
+ * Locking:
+ *
+ * Accesses to a message queue are synchronized by acquiring info->lock.
+ *
+ * There are two notable exceptions:
+ * - The actual wakeup of a sleeping task is performed using the wake_q
+ * framework. info->lock is already released when wake_up_q is called.
+ * - The exit codepaths after sleeping check ext_wait_queue->state without
+ * any locks. If it is STATE_READY, then the syscall is completed without
+ * acquiring info->lock.
+ *
+ * MQ_BARRIER:
+ * To achieve proper release/acquire memory barrier pairing, the state is set to
+ * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
+ * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
+ *
+ * This prevents the following races:
+ *
+ * 1) With the simple wake_q_add(), the task could be gone already before
+ * the increase of the reference happens
+ * Thread A
+ *                              Thread B
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              wake_q_add(A)
+ *                              if (cmpxchg()) // success
+ *                                 ->state = STATE_READY (reordered)
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * sysret to user space
+ * sys_exit()
+ *                              get_task_struct() // UaF
+ *
+ * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
+ * the smp_store_release() that does ->state = STATE_READY.
+ *
+ * 2) Without proper _release/_acquire barriers, the woken up task
+ * could read stale data
+ *
+ * Thread A
+ *                              Thread B
+ * do_mq_timedreceive
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              state = STATE_READY;
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * msg_ptr = wait.msg;          // Access to stale data!
+ *                              receiver->msg = message; (reordered)
+ *
+ * Solution: use _release and _acquire barriers.
+ *
+ * 3) There is intentionally no barrier when setting current->state
+ * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
+ * release memory barrier, and the wakeup is triggered when holding
+ * info->lock, i.e. spin_lock(&info->lock) provided a pairing
+ * acquire memory barrier.
+ */
+
struct ext_wait_queue { /* queue of sleeping tasks */
struct task_struct *task;
struct list_head list;
@@ -646,18 +706,23 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
wq_add(info, sr, ewp);
for (;;) {
+ /* memory barrier not required, we hold info->lock */
__set_current_state(TASK_INTERRUPTIBLE);
spin_unlock(&info->lock);
time = schedule_hrtimeout_range_clock(timeout, 0,
HRTIMER_MODE_ABS, CLOCK_REALTIME);
- if (ewp->state == STATE_READY) {
+ if (READ_ONCE(ewp->state) == STATE_READY) {
+ /* see MQ_BARRIER for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
retval = 0;
goto out;
}
spin_lock(&info->lock);
- if (ewp->state == STATE_READY) {
+
+ /* we hold info->lock, so no memory barrier required */
+ if (READ_ONCE(ewp->state) == STATE_READY) {
retval = 0;
goto out_unlock;
}
@@ -918,6 +983,18 @@ out_name:
* The same algorithm is used for senders.
*/
+static inline void __pipelined_op(struct wake_q_head *wake_q,
+ struct mqueue_inode_info *info,
+ struct ext_wait_queue *this)
+{
+ list_del(&this->list);
+ get_task_struct(this->task);
+
+ /* see MQ_BARRIER for purpose/pairing */
+ smp_store_release(&this->state, STATE_READY);
+ wake_q_add_safe(wake_q, this->task);
+}
+
/* pipelined_send() - send a message directly to the task waiting in
* sys_mq_timedreceive() (without inserting message into a queue).
*/
@@ -927,17 +1004,7 @@ static inline void pipelined_send(struct wake_q_head *wake_q,
struct ext_wait_queue *receiver)
{
receiver->msg = message;
- list_del(&receiver->list);
- wake_q_add(wake_q, receiver->task);
- /*
- * Rely on the implicit cmpxchg barrier from wake_q_add such
- * that we can ensure that updating receiver->state is the last
- * write operation: As once set, the receiver can continue,
- * and if we don't have the reference count from the wake_q,
- * yet, at that point we can later have a use-after-free
- * condition and bogus wakeup.
- */
- receiver->state = STATE_READY;
+ __pipelined_op(wake_q, info, receiver);
}
/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
@@ -955,9 +1022,7 @@ static inline void pipelined_receive(struct wake_q_head *wake_q,
if (msg_insert(sender->msg, info))
return;
- list_del(&sender->list);
- wake_q_add(wake_q, sender->task);
- sender->state = STATE_READY;
+ __pipelined_op(wake_q, info, sender);
}
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
@@ -1044,7 +1109,9 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
} else {
wait.task = current;
wait.msg = (void *) msg_ptr;
- wait.state = STATE_NONE;
+
+ /* memory barrier not required, we hold info->lock */
+ WRITE_ONCE(wait.state, STATE_NONE);
ret = wq_sleep(info, SEND, timeout, &wait);
/*
* wq_sleep must be called with info->lock held, and
@@ -1147,7 +1214,9 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
ret = -EAGAIN;
} else {
wait.task = current;
- wait.state = STATE_NONE;
+
+ /* memory barrier not required, we hold info->lock */
+ WRITE_ONCE(wait.state, STATE_NONE);
ret = wq_sleep(info, RECV, timeout, &wait);
msg_ptr = wait.msg;
}
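
A rough userspace analogue of the MQ_BARRIER pairing documented above, built on C11 atomics and pthreads rather than the kernel primitives (STATE_NONE/STATE_READY and the payload variable are stand-ins): the waker writes the payload and then publishes the state with a release store; the lock-free exit path re-reads the state and issues an acquire fence, the userspace counterpart of READ_ONCE() + smp_acquire__after_ctrl_dep(), before touching the payload.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { STATE_NONE, STATE_READY };

static _Atomic int state = STATE_NONE;
static int payload;			/* plain data protected by the pairing */

static void *waker(void *arg)
{
	(void)arg;
	payload = 42;					/* write data first */
	atomic_store_explicit(&state, STATE_READY,
			      memory_order_release);	/* then publish */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);

	/* Spin like the timed-out sleeper: a relaxed read of the state,
	 * followed by an acquire fence once STATE_READY is seen.
	 */
	while (atomic_load_explicit(&state, memory_order_relaxed) != STATE_READY)
		;
	atomic_thread_fence(memory_order_acquire);

	printf("payload = %d\n", payload);
	pthread_join(t, NULL);
	return 0;
}

Compiled with -pthread, the final printf is guaranteed to print 42; without the release/acquire pairing the memory model would allow it to observe the stale initial value.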
diff --git a/ipc/msg.c b/ipc/msg.c
index 8dec945fa030..22ed09ded601 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -61,6 +61,16 @@ struct msg_queue {
struct list_head q_senders;
} __randomize_layout;
+/*
+ * MSG_BARRIER Locking:
+ *
+ * Similar to the optimization used in ipc/mqueue.c, one syscall return path
+ * does not acquire any locks when it sees that a message exists in
+ * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release()
+ * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition,
+ * wake_q_add_safe() is used. See ipc/mqueue.c for more details
+ */
+
/* one msg_receiver structure for each sleeping receiver */
struct msg_receiver {
struct list_head r_list;
@@ -184,6 +194,10 @@ static inline void ss_add(struct msg_queue *msq,
{
mss->tsk = current;
mss->msgsz = msgsz;
+ /*
+ * No memory barrier required: we did ipc_lock_object(),
+ * and the waker obtains that lock before calling wake_q_add().
+ */
__set_current_state(TASK_INTERRUPTIBLE);
list_add_tail(&mss->list, &msq->q_senders);
}
@@ -237,8 +251,11 @@ static void expunge_all(struct msg_queue *msq, int res,
struct msg_receiver *msr, *t;
list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
- wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, ERR_PTR(res));
+ get_task_struct(msr->r_tsk);
+
+ /* see MSG_BARRIER for purpose/pairing */
+ smp_store_release(&msr->r_msg, ERR_PTR(res));
+ wake_q_add_safe(wake_q, msr->r_tsk);
}
}
@@ -377,7 +394,7 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
* NOTE: no locks must be held, the rwsem is taken inside this function.
*/
static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
- struct msqid64_ds *msqid64)
+ struct ipc64_perm *perm, int msg_qbytes)
{
struct kern_ipc_perm *ipcp;
struct msg_queue *msq;
@@ -387,7 +404,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
rcu_read_lock();
ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd,
- &msqid64->msg_perm, msqid64->msg_qbytes);
+ perm, msg_qbytes);
if (IS_ERR(ipcp)) {
err = PTR_ERR(ipcp);
goto out_unlock1;
@@ -409,18 +426,18 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
{
DEFINE_WAKE_Q(wake_q);
- if (msqid64->msg_qbytes > ns->msg_ctlmnb &&
+ if (msg_qbytes > ns->msg_ctlmnb &&
!capable(CAP_SYS_RESOURCE)) {
err = -EPERM;
goto out_unlock1;
}
ipc_lock_object(&msq->q_perm);
- err = ipc_update_perm(&msqid64->msg_perm, ipcp);
+ err = ipc_update_perm(perm, ipcp);
if (err)
goto out_unlock0;
- msq->q_qbytes = msqid64->msg_qbytes;
+ msq->q_qbytes = msg_qbytes;
msq->q_ctime = ktime_get_real_seconds();
/*
@@ -601,9 +618,9 @@ static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int ver
case IPC_SET:
if (copy_msqid_from_user(&msqid64, buf, version))
return -EFAULT;
- /* fallthru */
+ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes);
case IPC_RMID:
- return msgctl_down(ns, msqid, cmd, &msqid64);
+ return msgctl_down(ns, msqid, cmd, NULL, 0);
default:
return -EINVAL;
}
@@ -735,9 +752,9 @@ static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int versio
case IPC_SET:
if (copy_compat_msqid_from_user(&msqid64, uptr, version))
return -EFAULT;
- /* fallthru */
+ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes);
case IPC_RMID:
- return msgctl_down(ns, msqid, cmd, &msqid64);
+ return msgctl_down(ns, msqid, cmd, NULL, 0);
default:
return -EINVAL;
}
@@ -798,13 +815,17 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
list_del(&msr->r_list);
if (msr->r_maxsize < msg->m_ts) {
wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG));
+
+ /* See expunge_all regarding memory barrier */
+ smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG));
} else {
ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk));
msq->q_rtime = ktime_get_real_seconds();
wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, msg);
+
+ /* See expunge_all regarding memory barrier */
+ smp_store_release(&msr->r_msg, msg);
return 1;
}
}
@@ -1154,7 +1175,11 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msr_d.r_maxsize = INT_MAX;
else
msr_d.r_maxsize = bufsz;
- msr_d.r_msg = ERR_PTR(-EAGAIN);
+
+	/* memory barrier not required due to ipc_lock_object() */
+ WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN));
+
+ /* memory barrier not required, we own ipc_lock_object() */
__set_current_state(TASK_INTERRUPTIBLE);
ipc_unlock_object(&msq->q_perm);
@@ -1183,8 +1208,12 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
* signal) it will either see the message and continue ...
*/
msg = READ_ONCE(msr_d.r_msg);
- if (msg != ERR_PTR(-EAGAIN))
+ if (msg != ERR_PTR(-EAGAIN)) {
+ /* see MSG_BARRIER for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
+
goto out_unlock1;
+ }
/*
* ... or see -EAGAIN, acquire the lock to check the message
@@ -1192,7 +1221,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
*/
ipc_lock_object(&msq->q_perm);
- msg = msr_d.r_msg;
+ msg = READ_ONCE(msr_d.r_msg);
if (msg != ERR_PTR(-EAGAIN))
goto out_unlock0;
diff --git a/ipc/sem.c b/ipc/sem.c
index ec97a7072413..4f4303f32077 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -205,15 +205,38 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
*
* Memory ordering:
* Most ordering is enforced by using spin_lock() and spin_unlock().
- * The special case is use_global_lock:
+ *
+ * Exceptions:
+ * 1) use_global_lock: (SEM_BARRIER_1)
* Setting it from non-zero to 0 is a RELEASE, this is ensured by
- * using smp_store_release().
+ * using smp_store_release(): Immediately after setting it to 0,
+ * a simple op can start.
* Testing if it is non-zero is an ACQUIRE, this is ensured by using
* smp_load_acquire().
* Setting it from 0 to non-zero must be ordered with regards to
* this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
* is inside a spin_lock() and after a write from 0 to non-zero a
* spin_lock()+spin_unlock() is done.
+ *
+ * 2) queue.status: (SEM_BARRIER_2)
+ * Initialization is done while holding sem_lock(), so no further barrier is
+ * required.
+ * Setting it to a result code is a RELEASE, this is ensured by both a
+ * smp_store_release() (for case a) and while holding sem_lock()
+ * (for case b).
+ * The ACQUIRE when reading the result code without holding sem_lock() is
+ * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep()
+ * (case a above).
+ * Reading the result code while holding sem_lock() needs no further barriers;
+ * the locks inside sem_lock() enforce the ordering (case b above).
+ *
+ * 3) current->state:
+ * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
+ * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
+ * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
+ * when holding sem_lock(), no further barriers are required.
+ *
+ * See also ipc/mqueue.c for more details on the covered races.
*/
#define sc_semmsl sem_ctls[0]
@@ -344,12 +367,8 @@ static void complexmode_tryleave(struct sem_array *sma)
return;
}
if (sma->use_global_lock == 1) {
- /*
- * Immediately after setting use_global_lock to 0,
- * a simple op can start. Thus: all memory writes
- * performed by the current operation must be visible
- * before we set use_global_lock to 0.
- */
+
+ /* See SEM_BARRIER_1 for purpose/pairing */
smp_store_release(&sma->use_global_lock, 0);
} else {
sma->use_global_lock--;
@@ -400,7 +419,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
*/
spin_lock(&sem->lock);
- /* pairs with smp_store_release() */
+ /* see SEM_BARRIER_1 for purpose/pairing */
if (!smp_load_acquire(&sma->use_global_lock)) {
/* fast path successful! */
return sops->sem_num;
@@ -766,15 +785,12 @@ would_block:
static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
struct wake_q_head *wake_q)
{
- wake_q_add(wake_q, q->sleeper);
- /*
- * Rely on the above implicit barrier, such that we can
- * ensure that we hold reference to the task before setting
- * q->status. Otherwise we could race with do_exit if the
- * task is awoken by an external event before calling
- * wake_up_process().
- */
- WRITE_ONCE(q->status, error);
+ get_task_struct(q->sleeper);
+
+	/* see SEM_BARRIER_2 for purpose/pairing */
+ smp_store_release(&q->status, error);
+
+ wake_q_add_safe(wake_q, q->sleeper);
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -2148,9 +2164,11 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
}
do {
+ /* memory ordering ensured by the lock in sem_lock() */
WRITE_ONCE(queue.status, -EINTR);
queue.sleeper = current;
+ /* memory ordering is ensured by the lock in sem_lock() */
__set_current_state(TASK_INTERRUPTIBLE);
sem_unlock(sma, locknum);
rcu_read_unlock();
@@ -2173,13 +2191,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
*/
error = READ_ONCE(queue.status);
if (error != -EINTR) {
- /*
- * User space could assume that semop() is a memory
- * barrier: Without the mb(), the cpu could
- * speculatively read in userspace stale data that was
- * overwritten by the previous owner of the semaphore.
- */
- smp_mb();
+ /* see SEM_BARRIER_2 for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
goto out_free;
}
@@ -2189,6 +2202,9 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
+ /*
+	 * No barrier required: we are protected by sem_lock()
+ */
error = READ_ONCE(queue.status);
/*
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c74761004ee5..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
+ if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
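
get_unmapped_area() reports failure by returning a negative errno cast to unsigned long, so IS_ERR_VALUE(), which treats only the top MAX_ERRNO values of the address space as errors, is the canonical test used above. A userspace sketch of that convention, with the two macros mirrored in simplified form:

#include <stdio.h>

#define MAX_ERRNO	4095UL
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
	unsigned long addr = 0x7f0000001000UL;		/* ordinary address */
	unsigned long err = (unsigned long)-12;		/* -ENOMEM encoded */

	printf("IS_ERR_VALUE(addr) = %d\n", (int)IS_ERR_VALUE(addr));
	printf("IS_ERR_VALUE(-ENOMEM) = %d\n", (int)IS_ERR_VALUE(err));
	return 0;
}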
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c16cb3..f1443abb31f1 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -49,6 +49,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
int __read_mostly sysctl_hung_task_warnings = 10;
+int __read_mostly sysctl_hung_task_interval_warnings;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
@@ -85,6 +86,34 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+static void hung_task_warning(struct task_struct *t, bool timeout)
+{
+ const char *loglevel = timeout ? KERN_ERR : KERN_INFO;
+ const char *path;
+ int *warnings;
+
+ if (timeout) {
+ warnings = &sysctl_hung_task_warnings;
+ path = "hung_task_timeout_secs";
+ } else {
+ warnings = &sysctl_hung_task_interval_warnings;
+ path = "hung_task_interval_secs";
+ }
+
+ if (*warnings > 0)
+ --*warnings;
+
+ printk("%sINFO: task %s:%d blocked for more than %ld seconds.\n",
+ loglevel, t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
+ printk("%s %s %s %.*s\n",
+ loglevel, print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ printk("%s\"echo 0 > /proc/sys/kernel/%s\" disables this message.\n",
+ loglevel, path);
+ sched_show_task(t);
+}
+
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -109,6 +138,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
t->last_switch_time = jiffies;
return;
}
+ if (sysctl_hung_task_interval_warnings)
+ hung_task_warning(t, false);
+
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
return;
@@ -120,28 +152,57 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
hung_task_call_panic = true;
}
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
if (sysctl_hung_task_warnings) {
- if (sysctl_hung_task_warnings > 0)
- sysctl_hung_task_warnings--;
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
- pr_err(" %s %s %.*s\n",
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
+		/* Don't print warnings twice */
+ if (!sysctl_hung_task_interval_warnings)
+ hung_task_warning(t, true);
hung_task_show_lock = true;
}
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+	 * Also, skip vfork and any other user process that the freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -193,6 +254,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
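
A userspace sketch of the killed_time bookkeeping used in check_killed_task() above: record a stamp the first time a killed task is observed (bumping a zero stamp so that zero keeps meaning "not recorded"), then only complain once the deadline has passed. time() stands in for jiffies here and the surrounding task handling is left out.

#include <stdio.h>
#include <time.h>

static time_t killed_time;	/* 0 means "not seen as killed yet" */

static void check_killed(time_t now, long timeout_secs)
{
	if (!killed_time) {
		killed_time = now ? now : 1;	/* never store 0 as a stamp */
		return;
	}
	if (now < killed_time + timeout_secs)
		return;
	printf("task can't die for more than %ld seconds\n",
	       (long)(now - killed_time));
}

int main(void)
{
	time_t base = time(NULL);

	check_killed(base, 10);		/* first sighting: records the stamp */
	check_killed(base + 5, 10);	/* still within the timeout: silent */
	check_killed(base + 11, 10);	/* deadline passed: complains */
	return 0;
}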
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d9f5081d578d..63d7501ac638 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -23,22 +23,10 @@ static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
- WARN_ONCE(((*nl) == n), "double register detected");
- if (n->priority > (*nl)->priority)
- break;
- nl = &((*nl)->next);
- }
- n->next = *nl;
- rcu_assign_pointer(*nl, n);
- return 0;
-}
-
-static int notifier_chain_cond_register(struct notifier_block **nl,
- struct notifier_block *n)
-{
- while ((*nl) != NULL) {
- if ((*nl) == n)
+ if (unlikely((*nl) == n)) {
+ WARN(1, "double register detected");
return 0;
+ }
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
@@ -233,29 +221,6 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
- * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain, only if not already
- * present in the chain.
- * Must be called in process context.
- *
- * Currently always returns zero.
- */
-int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
-{
- int ret;
-
- down_write(&nh->rwsem);
- ret = notifier_chain_cond_register(&nh->head, n);
- up_write(&nh->rwsem);
- return ret;
-}
-EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
-
-/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
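
The merged notifier_chain_register() above now does both jobs in one walk: warn and return if the block is already on the chain, otherwise insert it before the first entry with a lower priority. A userspace sketch of that list manipulation, with struct nb standing in for struct notifier_block and plain assignment in place of rcu_assign_pointer():

#include <stdio.h>

struct nb {
	int priority;
	const char *name;
	struct nb *next;
};

static int chain_register(struct nb **nl, struct nb *n)
{
	while (*nl != NULL) {
		if (*nl == n) {
			fprintf(stderr, "double register detected\n");
			return 0;
		}
		if (n->priority > (*nl)->priority)
			break;
		nl = &(*nl)->next;
	}
	n->next = *nl;
	*nl = n;
	return 0;
}

int main(void)
{
	struct nb *head = NULL;
	struct nb a = { 10, "a", NULL }, b = { 20, "b", NULL }, c = { 15, "c", NULL };

	chain_register(&head, &a);
	chain_register(&head, &b);
	chain_register(&head, &c);
	chain_register(&head, &b);	/* duplicate: warned about, not added */

	for (struct nb *p = head; p; p = p->next)
		printf("%s(%d) ", p->name, p->priority);
	printf("\n");			/* b(20) c(15) a(10) */
	return 0;
}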
diff --git a/kernel/profile.c b/kernel/profile.c
index af7c94bf5fa1..4b144b02ca5d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -336,7 +336,7 @@ static int profile_dead_cpu(unsigned int cpu)
struct page *page;
int i;
- if (prof_cpu_mask != NULL)
+ if (cpumask_available(prof_cpu_mask))
cpumask_clear_cpu(cpu, prof_cpu_mask);
for (i = 0; i < 2; i++) {
@@ -373,7 +373,7 @@ static int profile_prepare_cpu(unsigned int cpu)
static int profile_online_cpu(unsigned int cpu)
{
- if (prof_cpu_mask != NULL)
+ if (cpumask_available(prof_cpu_mask))
cpumask_set_cpu(cpu, prof_cpu_mask);
return 0;
@@ -403,7 +403,7 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (!user_mode(regs) && prof_cpu_mask != NULL &&
+ if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6f2f35d0bcf..31ece1120aa4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1148,6 +1148,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &neg_one,
},
+ {
+ .procname = "hung_task_interval_warnings",
+ .data = &sysctl_hung_task_interval_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ },
#endif
#ifdef CONFIG_RT_MUTEXES
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 27ff08ce03e9..83a804d58e2f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2147,6 +2147,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/math/rational.c b/lib/math/rational.c
index ba7443677c90..31fb27db2deb 100644
--- a/lib/math/rational.c
+++ b/lib/math/rational.c
@@ -3,6 +3,7 @@
* rational fractions
*
* Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
+ * Copyright (C) 2019 Trent Piepho <tpiepho@gmail.com>
*
* helper functions when coping with rational numbers
*/
@@ -10,6 +11,7 @@
#include <linux/rational.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/kernel.h>
/*
* calculate best rational approximation for a given fraction
@@ -33,30 +35,65 @@ void rational_best_approximation(
unsigned long max_numerator, unsigned long max_denominator,
unsigned long *best_numerator, unsigned long *best_denominator)
{
- unsigned long n, d, n0, d0, n1, d1;
+ /* n/d is the starting rational, which is continually
+ * decreased each iteration using the Euclidean algorithm.
+ *
+ * dp is the value of d from the prior iteration.
+ *
+ * n2/d2, n1/d1, and n0/d0 are our successively more accurate
+ * approximations of the rational. They are, respectively,
+ * the current, previous, and two prior iterations of it.
+ *
+ * a is current term of the continued fraction.
+ */
+ unsigned long n, d, n0, d0, n1, d1, n2, d2;
n = given_numerator;
d = given_denominator;
n0 = d1 = 0;
n1 = d0 = 1;
+
for (;;) {
- unsigned long t, a;
- if ((n1 > max_numerator) || (d1 > max_denominator)) {
- n1 = n0;
- d1 = d0;
- break;
- }
+ unsigned long dp, a;
+
if (d == 0)
break;
- t = d;
+ /* Find next term in continued fraction, 'a', via
+ * Euclidean algorithm.
+ */
+ dp = d;
a = n / d;
d = n % d;
- n = t;
- t = n0 + a * n1;
+ n = dp;
+
+ /* Calculate the current rational approximation (aka
+ * convergent), n2/d2, using the term just found and
+ * the two prior approximations.
+ */
+ n2 = n0 + a * n1;
+ d2 = d0 + a * d1;
+
+ /* If the current convergent exceeds the maxes, then
+ * return either the previous convergent or the
+ * largest semi-convergent, the final term of which is
+ * found below as 't'.
+ */
+ if ((n2 > max_numerator) || (d2 > max_denominator)) {
+ unsigned long t = min((max_numerator - n0) / n1,
+ (max_denominator - d0) / d1);
+
+ /* This tests if the semi-convergent is closer
+ * than the previous convergent.
+ */
+ if (2u * t > a || (2u * t == a && d0 * dp > d1 * d)) {
+ n1 = n0 + t * n1;
+ d1 = d0 + t * d1;
+ }
+ break;
+ }
n0 = n1;
- n1 = t;
- t = d0 + a * d1;
+ n1 = n2;
d0 = d1;
- d1 = t;
+ d1 = d2;
}
*best_numerator = n1;
*best_denominator = d1;
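
A standalone userspace version of the rewritten algorithm above (same structure, kernel-only details dropped) that shows the semi-convergent handling when a convergent overshoots the limits:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static void best_rational(unsigned long num, unsigned long den,
			  unsigned long max_num, unsigned long max_den,
			  unsigned long *best_num, unsigned long *best_den)
{
	unsigned long n = num, d = den, n0 = 0, d0 = 1, n1 = 1, d1 = 0, n2, d2;

	for (;;) {
		unsigned long dp, a;

		if (d == 0)
			break;
		/* next continued fraction term via the Euclidean algorithm */
		dp = d;
		a = n / d;
		d = n % d;
		n = dp;
		/* next convergent from the two previous ones */
		n2 = n0 + a * n1;
		d2 = d0 + a * d1;
		if (n2 > max_num || d2 > max_den) {
			unsigned long t = min_ul((max_num - n0) / n1,
						 (max_den - d0) / d1);

			/* keep the semi-convergent only if it is closer */
			if (2 * t > a || (2 * t == a && d0 * dp > d1 * d)) {
				n1 = n0 + t * n1;
				d1 = d0 + t * d1;
			}
			break;
		}
		n0 = n1;
		n1 = n2;
		d0 = d1;
		d1 = d2;
	}
	*best_num = n1;
	*best_den = d1;
}

int main(void)
{
	unsigned long p, q;

	/* pi to 8 digits, numerator capped at 1000, denominator at 100 */
	best_rational(314159265UL, 100000000UL, 1000UL, 100UL, &p, &q);
	printf("%lu/%lu\n", p, q);	/* expected: 311/99 */
	return 0;
}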
diff --git a/lib/ubsan.c b/lib/ubsan.c
index 0c4681118fcd..f007a406f89c 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -140,25 +140,21 @@ static void val_to_string(char *str, size_t size, struct type_descriptor *type,
}
}
-static DEFINE_SPINLOCK(report_lock);
-
-static void ubsan_prologue(struct source_location *location,
- unsigned long *flags)
+static void ubsan_prologue(struct source_location *location)
{
current->in_ubsan++;
- spin_lock_irqsave(&report_lock, *flags);
pr_err("========================================"
"========================================\n");
print_source_location("UBSAN: Undefined behaviour in", location);
}
-static void ubsan_epilogue(unsigned long *flags)
+static void ubsan_epilogue(void)
{
dump_stack();
pr_err("========================================"
"========================================\n");
- spin_unlock_irqrestore(&report_lock, *flags);
+
current->in_ubsan--;
}
@@ -167,14 +163,13 @@ static void handle_overflow(struct overflow_data *data, void *lhs,
{
struct type_descriptor *type = data->type;
- unsigned long flags;
char lhs_val_str[VALUE_LENGTH];
char rhs_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs);
val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs);
@@ -186,7 +181,7 @@ static void handle_overflow(struct overflow_data *data, void *lhs,
rhs_val_str,
type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
void __ubsan_handle_add_overflow(struct overflow_data *data,
@@ -214,20 +209,19 @@ EXPORT_SYMBOL(__ubsan_handle_mul_overflow);
void __ubsan_handle_negate_overflow(struct overflow_data *data,
void *old_val)
{
- unsigned long flags;
char old_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val);
pr_err("negation of %s cannot be represented in type %s:\n",
old_val_str, data->type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
EXPORT_SYMBOL(__ubsan_handle_negate_overflow);
@@ -235,13 +229,12 @@ EXPORT_SYMBOL(__ubsan_handle_negate_overflow);
void __ubsan_handle_divrem_overflow(struct overflow_data *data,
void *lhs, void *rhs)
{
- unsigned long flags;
char rhs_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs);
@@ -251,58 +244,52 @@ void __ubsan_handle_divrem_overflow(struct overflow_data *data,
else
pr_err("division by zero\n");
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
EXPORT_SYMBOL(__ubsan_handle_divrem_overflow);
static void handle_null_ptr_deref(struct type_mismatch_data_common *data)
{
- unsigned long flags;
-
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location, &flags);
+ ubsan_prologue(data->location);
pr_err("%s null pointer of type %s\n",
type_check_kinds[data->type_check_kind],
data->type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
static void handle_misaligned_access(struct type_mismatch_data_common *data,
unsigned long ptr)
{
- unsigned long flags;
-
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location, &flags);
+ ubsan_prologue(data->location);
pr_err("%s misaligned address %p for type %s\n",
type_check_kinds[data->type_check_kind],
(void *)ptr, data->type->type_name);
pr_err("which requires %ld byte alignment\n", data->alignment);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
static void handle_object_size_mismatch(struct type_mismatch_data_common *data,
unsigned long ptr)
{
- unsigned long flags;
-
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location, &flags);
+ ubsan_prologue(data->location);
pr_err("%s address %p with insufficient space\n",
type_check_kinds[data->type_check_kind],
(void *) ptr);
pr_err("for an object of type %s\n", data->type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
static void ubsan_type_mismatch_common(struct type_mismatch_data_common *data,
@@ -351,25 +338,23 @@ EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1);
void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index)
{
- unsigned long flags;
char index_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(index_str, sizeof(index_str), data->index_type, index);
pr_err("index %s is out of range for type %s\n", index_str,
data->array_type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
EXPORT_SYMBOL(__ubsan_handle_out_of_bounds);
void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
void *lhs, void *rhs)
{
- unsigned long flags;
struct type_descriptor *rhs_type = data->rhs_type;
struct type_descriptor *lhs_type = data->lhs_type;
char rhs_str[VALUE_LENGTH];
@@ -379,7 +364,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
if (suppress_report(&data->location))
goto out;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs);
val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs);
@@ -402,7 +387,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
lhs_str, rhs_str,
lhs_type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
out:
user_access_restore(ua_flags);
}
@@ -411,11 +396,9 @@ EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds);
void __ubsan_handle_builtin_unreachable(struct unreachable_data *data)
{
- unsigned long flags;
-
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
pr_err("calling __builtin_unreachable()\n");
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
panic("can't return from __builtin_unreachable()");
}
EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable);
@@ -423,19 +406,18 @@ EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable);
void __ubsan_handle_load_invalid_value(struct invalid_value_data *data,
void *val)
{
- unsigned long flags;
char val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location, &flags);
+ ubsan_prologue(&data->location);
val_to_string(val_str, sizeof(val_str), data->type, val);
pr_err("load of value %s is not a valid value for type %s\n",
val_str, data->type->type_name);
- ubsan_epilogue(&flags);
+ ubsan_epilogue();
}
EXPORT_SYMBOL(__ubsan_handle_load_invalid_value);
diff --git a/mm/cma.c b/mm/cma.c
index 7fe0b8356775..be55d1988c67 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
static int __init cma_activate_area(struct cma *cma)
{
- int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
- cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-
+ cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
if (!cma->bitmap) {
cma->count = 0;
return -ENOMEM;
@@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma)
not_in_zone:
pr_err("CMA area %s could not be activated\n", cma->name);
- kfree(cma->bitmap);
+ bitmap_free(cma->bitmap);
cma->count = 0;
return -EINVAL;
}
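
The bitmap_zalloc()/bitmap_free() conversion above hides the BITS_TO_LONGS() size calculation that the open-coded kzalloc() had to repeat. A simplified userspace equivalent of the pair; the helpers and macros below are stand-ins, not the kernel implementations.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG		(CHAR_BIT * sizeof(long))
#define BITS_TO_LONGS(nbits)	(((nbits) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long *bitmap_zalloc_user(unsigned long nbits)
{
	/* zeroed allocation, sized in longs like the kernel helper */
	return calloc(BITS_TO_LONGS(nbits), sizeof(long));
}

static void bitmap_free_user(unsigned long *bitmap)
{
	free(bitmap);
}

int main(void)
{
	unsigned long nbits = 1000;
	unsigned long *bitmap = bitmap_zalloc_user(nbits);

	if (!bitmap)
		return 1;
	printf("%lu bits -> %zu longs\n", nbits, (size_t)BITS_TO_LONGS(nbits));
	bitmap_free_user(bitmap);
	return 0;
}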
diff --git a/mm/filemap.c b/mm/filemap.c
index 85b7d087eb45..1f5731768222 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
- struct file *fpin)
-{
- int flags = vmf->flags;
-
- if (fpin)
- return fpin;
-
- /*
- * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
- * FAULT_FLAG_ALLOW_RETRY is set.
- */
- if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
- FAULT_FLAG_ALLOW_RETRY) {
- fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
- }
- return fpin;
-}
-
/*
* lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
* @vmf - the vm_fault for this fault.
diff --git a/mm/gup.c b/mm/gup.c
index 8f236a335ae9..c2b3e117d706 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1443,6 +1443,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
bool drain_allow = true;
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
+ long ret = nr_pages;
check_again:
for (i = 0; i < nr_pages;) {
@@ -1504,17 +1505,18 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
- if ((nr_pages > 0) && migrate_allow) {
+ if ((ret > 0) && migrate_allow) {
+ nr_pages = ret;
drain_allow = true;
goto check_again;
}
}
- return nr_pages;
+ return ret;
}
#else
static long check_and_migrate_cma_pages(struct task_struct *tsk,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 13cc93785006..b68c13e564d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -686,20 +686,18 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return false;
+
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
- return true;
}
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a95363a84..a5c2c880af27 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -244,16 +244,66 @@ struct file_region {
long to;
};
+/* Must be called with resv->lock held. Calling this with count_only == true
+ * will count the number of pages to be added but will not modify the linked
+ * list.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ bool count_only)
+{
+ long chg = 0;
+ struct list_head *head = &resv->regions;
+ struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* We overlap with this area, if it extends further than
+ * us then we must extend ourselves. Account for its
+ * existing reservation.
+ */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+
+ if (!count_only && rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+
+ if (!count_only) {
+ nrg->from = f;
+ nrg->to = t;
+ }
+
+ return chg;
+}
+
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
+ * map. Existing regions will be expanded to accommodate the specified
+ * range, or a region will be taken from the cache. Sufficient regions
+ * must exist in the cache due to the previous call to region_chg with
+ * the same range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,7 +311,7 @@ struct file_region {
static long region_add(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg, *trg;
+ struct file_region *rg, *nrg;
long add = 0;
spin_lock(&resv->lock);
@@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t)
/*
* If no region exists which can be expanded to include the
- * specified range, the list must have been modified by an
- * interleving call to region_del(). Pull a region descriptor
- * from the cache and use it for this range.
+ * specified range, pull a region descriptor from the cache
+ * and use it for this range.
*/
if (&rg->link == head || t < rg->from) {
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t)
goto out_locked;
}
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
-
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
-
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- /* Decrement return value by the deleted range.
- * Another range will span this area so that by
- * end of routine add will be >= zero
- */
- add -= (rg->to - rg->from);
- list_del(&rg->link);
- kfree(rg);
- }
- }
-
- add += (nrg->from - f); /* Added to beginning of region */
- nrg->from = f;
- add += t - nrg->to; /* Added to end of region */
- nrg->to = t;
+ add = add_reservation_in_range(resv, f, t, false);
out_locked:
resv->adds_in_progress--;
@@ -339,15 +357,9 @@ out_locked:
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
- * map. However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder. This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
- *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map. If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * map. A new file_region structure is added to the cache
+ * as a placeholder, so that the subsequent region_add
+ * call will have all the regions it needs and will not fail.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
@@ -356,11 +368,8 @@ out_locked:
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg = NULL;
long chg = 0;
-retry:
spin_lock(&resv->lock);
retry_locked:
resv->adds_in_progress++;
@@ -378,10 +387,8 @@ retry_locked:
spin_unlock(&resv->lock);
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg) {
- kfree(nrg);
+ if (!trg)
return -ENOMEM;
- }
spin_lock(&resv->lock);
list_add(&trg->link, &resv->region_cache);
@@ -389,61 +396,8 @@ retry_locked:
goto retry_locked;
}
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+ chg = add_reservation_in_range(resv, f, t, true);
- /* If we are below the current region then a new region is required.
- * Subtle, allocate a new region at the position but make it zero
- * size such that we can guarantee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- if (!nrg) {
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (!nrg)
- return -ENOMEM;
-
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- goto retry;
- }
-
- list_add(&nrg->link, rg->link.prev);
- chg = t - f;
- goto out_nrg;
- }
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
-
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- goto out;
-
- /* We overlap with this area, if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
- }
- chg -= rg->to - rg->from;
- }
-
-out:
- spin_unlock(&resv->lock);
- /* We already know we raced and no longer need the new region */
- kfree(nrg);
- return chg;
-out_nrg:
spin_unlock(&resv->lock);
return chg;
}
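
A minimal sketch of the call pattern this split is designed for, assuming the usual hugetlb reservation flow: region_chg() now only charges the range and guarantees one cached file_region descriptor, and the later region_add() (or region_abort() on error) consumes it. The wrapper name and the accounting check below are illustrative, not functions from this patch:

	/* Illustrative caller pattern, not literal hugetlb code. */
	static long reserve_range_sketch(struct resv_map *resv, long from, long to)
	{
		long chg = region_chg(resv, from, to);	/* charge + cache one descriptor */

		if (chg < 0)
			return chg;

		if (!subpool_and_quota_ok(chg)) {	/* hypothetical accounting check */
			region_abort(resv, from, to);	/* drop adds_in_progress again */
			return -ENOSPC;
		}

		return region_add(resv, from, to);	/* commit, using the cached entry */
	}
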
@@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
}
#ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages, gfp_t gfp_mask)
-{
- unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long i, end_pfn = start_pfn + nr_pages;
- struct page *page;
-
- for (i = start_pfn; i < end_pfn; i++) {
- page = pfn_to_online_page(i);
- if (!page)
- return false;
-
- if (page_zone(page) != z)
- return false;
-
- if (PageReserved(page))
- return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
- }
-
- return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long last_pfn = start_pfn + nr_pages - 1;
- return zone_spans_pfn(zone, last_pfn);
-}
-
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned int order = huge_page_order(h);
- unsigned long nr_pages = 1 << order;
- unsigned long ret, pfn, flags;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
-
- zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
- spin_lock_irqsave(&zone->lock, flags);
-
- pfn = ALIGN(zone->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
- /*
- * We release the zone lock here because
- * alloc_contig_range() will also lock the zone
- * at some point. If there's an allocation
- * spinning on this lock, it may win the race
- * and cause alloc_contig_range() to fail...
- */
- spin_unlock_irqrestore(&zone->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
- if (!ret)
- return pfn_to_page(pfn);
- spin_lock_irqsave(&zone->lock, flags);
- }
- pfn += nr_pages;
- }
-
- spin_unlock_irqrestore(&zone->lock, flags);
- }
+ unsigned long nr_pages = 1UL << huge_page_order(h);
- return NULL;
+ return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
@@ -3915,7 +3796,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4042,8 +3923,7 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
unsigned long key[2];
u32 hash;
@@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
key[0] = (unsigned long) mapping;
key[1] = idx;
- hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+ hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
return hash & (num_fault_mutexes - 1);
}
@@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 * For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
return 0;
}
@@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
}
+
+ /*
+	 * If subpage information is not requested, update counters
+ * and skip the same_page loop below.
+ */
+ if (!pages && !vmas && !pfn_offset &&
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
+ (remainder >= pages_per_huge_page(h))) {
+ vaddr += huge_page_size(h);
+ remainder -= pages_per_huge_page(h);
+ i += pages_per_huge_page(h);
+ spin_unlock(ptl);
+ continue;
+ }
+
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720c75ab..3cf20ab3ca01 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -165,6 +165,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
+extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_reset(struct zone *zone);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -290,7 +293,8 @@ static inline bool is_data_mapping(vm_flags_t flags)
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent);
+ struct vm_area_struct *prev);
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
@@ -362,6 +366,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
return max(start, vma->vm_start);
}
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
+{
+ int flags = vmf->flags;
+
+ if (fpin)
+ return fpin;
+
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
+
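
A hedged sketch of how a fault path is expected to use this helper (only maybe_unlock_mmap_for_io(), fput() and the FAULT_FLAG_*/VM_FAULT_* values are real interfaces; the function below is illustrative): pass the current fpin through, do the blocking work with mmap_sem possibly dropped, then release the pinned file and ask the caller to retry, mirroring fault_dirty_shared_page() later in this series.

	/* Illustrative fault-path fragment, not a literal mm/ function. */
	static vm_fault_t wait_with_mmap_sem_dropped(struct vm_fault *vmf)
	{
		struct file *fpin = NULL;

		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		/* ...block on page lock / writeback here... */
		if (fpin) {
			fput(fpin);		/* drop the reference the helper took */
			return VM_FAULT_RETRY;	/* mmap_sem was released; retry the fault */
		}
		return 0;			/* mmap_sem still held; finish normally */
	}
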
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0a1b4b484ac5..cd480dce92c6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1601,17 +1601,6 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_unlocked;
}
- } else if (!PageUptodate(page)) {
- xas_unlock_irq(&xas);
- wait_on_page_locked(page);
- if (!trylock_page(page)) {
- result = SCAN_PAGE_LOCK;
- goto xa_unlocked;
- }
- get_page(page);
- } else if (PageDirty(page)) {
- result = SCAN_FAIL;
- goto xa_locked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1626,7 +1615,12 @@ static void collapse_file(struct mm_struct *mm,
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ /* double check the page is up to date */
+ if (unlikely(!PageUptodate(page))) {
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
/*
* If file was truncated then extended, or hole-punched, before
@@ -1642,6 +1636,24 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
+ if (!is_shmem && PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+		 * since the first write. There won't be new dirty
+ * pages.
+ *
+ * Trigger async flush here and hope the writeback
+ * is done when khugepaged revisits this page.
+ *
+ * This is a one-off situation. We are not forcing
+		 * writeback in a loop.
+ */
+ filemap_flush(mapping);
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
diff --git a/mm/madvise.c b/mm/madvise.c
index 2be9f3fdb05e..99dd06fecfa9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -887,7 +887,7 @@ static int madvise_inject_error(int behavior,
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
- ret = soft_offline_page(page, MF_COUNT_INCREASED);
+ ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
diff --git a/mm/memblock.c b/mm/memblock.c
index c4b16cae2bc9..1f84be7d2b7d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1323,12 +1323,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* @start: the lower bound of the memory region to allocate (phys address)
* @end: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: controls whether the allocation may fall back to other nodes
*
* The allocation is performed from memory region limited by
* memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
*
- * If the specified node can not hold the requested memory the
- * allocation falls back to any node in the system
+ * If the specified node can not hold the requested memory and @exact_nid
+ * is false, the allocation falls back to any node in the system
*
* For systems with memory mirroring, the allocation is attempted first
* from the regions with mirroring enabled and then retried from any
@@ -1342,7 +1343,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
*/
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid,
+ bool exact_nid)
{
enum memblock_flags flags = choose_memblock_flags();
phys_addr_t found;
@@ -1362,7 +1364,7 @@ again:
if (found && !memblock_reserve(found, size))
goto done;
- if (nid != NUMA_NO_NODE) {
+ if (nid != NUMA_NO_NODE && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
@@ -1410,7 +1412,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t start,
phys_addr_t end)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ false);
}
/**
@@ -1429,7 +1432,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_range_nid(size, align, 0,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
}
/**
@@ -1439,6 +1442,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
* @min_addr: the lower bound of the memory region to allocate (phys address)
* @max_addr: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: controls whether the allocation may fall back to other nodes
*
* Allocates memory block using memblock_alloc_range_nid() and
* converts the returned physical address to virtual.
@@ -1454,7 +1458,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
static void * __init memblock_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
+ int nid, bool exact_nid)
{
phys_addr_t alloc;
@@ -1469,11 +1473,13 @@ static void * __init memblock_alloc_internal(
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
- alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+ alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
+ exact_nid);
/* retry allocation without lower limit */
if (!alloc && min_addr)
- alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
+ exact_nid);
if (!alloc)
return NULL;
@@ -1482,6 +1488,43 @@ static void * __init memblock_alloc_internal(
}
/**
+ * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
+ * without zeroing memory
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_alloc_exact_nid_raw(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
+
+ ptr = memblock_alloc_internal(size, align,
+ min_addr, max_addr, nid, true);
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
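
A minimal usage sketch for the new helper (the wrapper name and buffer are made up; the memblock call is the one added above). Two properties matter for callers: the _raw variant does not zero the memory, and because exact_nid is passed as true internally, a NULL return means the requested node cannot satisfy the allocation rather than triggering a cross-node fallback.

	/* Hypothetical early-boot, per-node buffer; __init, before the buddy is up. */
	static void * __init alloc_node_local_buffer(int nid, unsigned long size)
	{
		void *buf = memblock_alloc_exact_nid_raw(size, PAGE_SIZE, 0,
							 MEMBLOCK_ALLOC_ACCESSIBLE, nid);

		if (!buf)
			return NULL;		/* node nid has no suitable memory */

		memset(buf, 0, size);		/* the _raw variant leaves it uninitialized */
		return buf;
	}
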
+/**
* memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
* memory and without panicking
* @size: size of memory block to be allocated in bytes
@@ -1512,7 +1555,7 @@ void * __init memblock_alloc_try_nid_raw(
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid);
+ min_addr, max_addr, nid, false);
if (ptr && size > 0)
page_init_poison(ptr, size);
@@ -1547,7 +1590,7 @@ void * __init memblock_alloc_try_nid(
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid);
+ min_addr, max_addr, nid, false);
if (ptr)
memset(ptr, 0, size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b3865a1e4c1..ea085877c548 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -98,14 +98,6 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}
-static const char *const mem_cgroup_lru_names[] = {
- "inactive_anon",
- "active_anon",
- "inactive_file",
- "active_file",
- "unevictable",
-};
-
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
@@ -1052,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_per_node *mz;
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
- iter = &mz->iter[reclaim->priority];
+ iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
@@ -1152,15 +1144,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
- int i;
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(from, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ iter = &mz->iter;
+ cmpxchg(&iter->position, dead_memcg, NULL);
}
}
@@ -1438,7 +1426,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -1451,8 +1439,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
/* Accumulated memory events */
- seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
- seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
+ memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
+ memcg_events(memcg, PGMAJFAULT));
seq_buf_printf(&s, "workingset_refault %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT));
@@ -1461,22 +1451,27 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
- seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
+ memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
- seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
- seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
- seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
- seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
+ memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
+ memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
+ memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
+ memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
memcg_events(memcg, THP_FAULT_ALLOC));
- seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -1705,7 +1700,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
- .priority = 0,
};
excess = soft_limit_excess(root_memcg);
@@ -3849,13 +3843,6 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static const char *const memcg1_event_names[] = {
- "pgpgin",
- "pgpgout",
- "pgfault",
- "pgmajfault",
-};
-
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -3864,7 +3851,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned int i;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
@@ -3875,11 +3861,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
memcg_page_state_local(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -3904,11 +3890,12 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+ seq_printf(m, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
(u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
@@ -5014,12 +5001,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- /*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
- */
- memcg_flush_percpu_vmstats(memcg, false);
- memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
@@ -5030,6 +5011,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -6087,7 +6074,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
+ unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long high;
int err;
@@ -6098,12 +6086,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
- nr_pages = page_counter_read(&memcg->memory);
- if (nr_pages > high)
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long reclaimed;
+
+ if (nr_pages <= high)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ break;
+ }
- memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -6135,10 +6140,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_pages <= max)
break;
- if (signal_pending(current)) {
- err = -EINTR;
+ if (signal_pending(current))
break;
- }
if (!drained) {
drain_all_stock(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87dff73..af2712004a4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -303,25 +303,19 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma,
- struct list_head *to_kill,
- struct to_kill **tkc)
+ struct list_head *to_kill)
{
struct to_kill *tk;
- if (*tkc) {
- tk = *tkc;
- *tkc = NULL;
- } else {
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
- if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
- return;
- }
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+ if (!tk) {
+ pr_err("Memory failure: Out of memory while machine check handling\n");
+ return;
}
+
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
@@ -345,6 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
kfree(tk);
return;
}
+
get_task_struct(tsk);
tk->tsk = tsk;
list_add_tail(&tk->nd, to_kill);
@@ -436,7 +431,7 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -461,7 +456,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -472,7 +467,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -496,7 +491,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -505,26 +500,17 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
/*
* Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
*/
static void collect_procs(struct page *page, struct list_head *tokill,
int force_early)
{
- struct to_kill *tk;
-
if (!page->mapping)
return;
- tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
- if (!tk)
- return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk, force_early);
+ collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, &tk, force_early);
- kfree(tk);
+ collect_procs_file(page, tokill, force_early);
}
static const char *action_name[] = {
@@ -1490,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work)
if (!gotten)
break;
if (entry.flags & MF_SOFT_OFFLINE)
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ soft_offline_page(entry.pfn, entry.flags);
else
memory_failure(entry.pfn, entry.flags);
}
@@ -1871,7 +1857,7 @@ static int soft_offline_free_page(struct page *page)
/**
* soft_offline_page - Soft offline a page.
- * @page: page to offline
+ * @pfn: pfn to soft-offline
* @flags: flags. Same as memory_failure().
*
* Returns 0 on success, otherwise negated errno.
@@ -1891,18 +1877,17 @@ static int soft_offline_free_page(struct page *page)
* This is not a 100% solution for all memory, but tries to be
* ``good enough'' for the majority of memory.
*/
-int soft_offline_page(struct page *page, int flags)
+int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
- unsigned long pfn = page_to_pfn(page);
+ struct page *page;
- if (is_zone_device_page(page)) {
- pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_page(page);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+ page = pfn_to_online_page(pfn);
+ if (!page)
return -EIO;
- }
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/memory.c b/mm/memory.c
index b6a5d6a08438..7596d625ebd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@
#include <linux/oom.h>
#include <linux/numa.h>
+#include <trace/events/kmem.h>
+
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
@@ -152,6 +154,10 @@ static int __init init_zero_pfn(void)
}
core_initcall(init_zero_pfn);
+void mm_trace_rss_stat(int member, long count)
+{
+ trace_rss_stat(member, count);
+}
#if defined(SPLIT_RSS_COUNTING)
@@ -2289,10 +2295,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
*
* The function expects the page to be locked and unlocks it.
*/
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
- struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
+ struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
@@ -2307,16 +2314,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
mapping = page_rmapping(page);
unlock_page(page);
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+
+ /*
+ * Throttle page dirtying rate down to writeback speed.
+ *
+ * mapping may be NULL here because some device drivers do not
+ * set page.mapping but still dirty their pages
+ *
+ * Drop the mmap_sem before waiting on IO, if we can. The file
+ * is pinning the mapping, as per above.
+ */
if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
+ struct file *fpin;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
balance_dirty_pages_ratelimited(mapping);
+ if (fpin) {
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
}
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ return 0;
}
/*
@@ -2571,6 +2592,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
@@ -2594,10 +2616,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
wp_page_reuse(vmf);
lock_page(vmf->page);
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
- return VM_FAULT_WRITE;
+ return ret;
}
/*
@@ -3083,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/*
* The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
+ * preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
@@ -3641,7 +3663,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
return ret;
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
return ret;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index df570e5c71cc..561371ead39a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -49,8 +49,6 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page, unsigned int order);
-
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -278,6 +276,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
return 0;
}
+static int check_hotplug_memory_addressable(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+ if (max_addr >> MAX_PHYSMEM_BITS) {
+ const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+ WARN(1,
+ "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+ (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
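
As a worked example of the new bounds check, assuming MAX_PHYSMEM_BITS is 46 (x86-64 without 5-level paging): hot-adding 1 GiB starting at physical address 1 << 46 gives max_addr = (1 << 46) + (1 << 30) - 1, so max_addr >> 46 is non-zero and the request fails with -E2BIG, whereas a 1 GiB range ending at (1 << 46) - 1 still passes the check.
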
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -291,6 +305,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
+ err = check_hotplug_memory_addressable(pfn, nr_pages);
+ if (err)
+ return err;
+
if (altmap) {
/*
* Validate altmap is within bounds of the total request
@@ -331,13 +349,13 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long end_pfn)
{
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(start_pfn)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -356,13 +374,13 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -374,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -389,50 +404,30 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
@@ -457,34 +452,50 @@ static void update_pgdat_span(struct pglist_data *pgdat)
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages)
+void __ref remove_pfn_range_from_zone(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+ /* Poison struct pages because they are now uninitialized again. */
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
+ clear_zone_contiguous(zone);
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ set_zone_contiguous(zone);
}
-static void __remove_section(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- __remove_zone(zone, pfn, nr_pages);
sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
- * __remove_pages() - remove sections of pages from a zone
- * @zone: zone from which pages need to be removed
+ * __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
@@ -494,34 +505,25 @@ static void __remove_section(struct zone *zone, unsigned long pfn,
 * sure that pages are marked reserved and zones are adjusted properly by
* calling offline_pages().
*/
-void __remove_pages(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
- clear_zone_contiguous(zone);
-
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(zone, pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
-
- set_zone_contiguous(zone);
}
int set_online_page_callback(online_page_callback_t callback)
@@ -562,24 +564,7 @@ int restore_online_page_callback(online_page_callback_t callback)
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
-void __online_page_set_limits(struct page *page)
-{
-}
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
-
-void __online_page_increment_counters(struct page *page)
-{
- adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-
-void __online_page_free(struct page *page)
-{
- __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
-
-static void generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
@@ -589,6 +574,7 @@ static void generic_online_page(struct page *page, unsigned int order)
totalhigh_pages_add(1UL << order);
#endif
}
+EXPORT_SYMBOL_GPL(generic_online_page);
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
@@ -849,6 +835,7 @@ failed_addition:
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ remove_pfn_range_from_zone(zone, pfn, nr_pages);
mem_hotplug_done();
return ret;
}
@@ -1162,7 +1149,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+ MEMORY_OFFLINE);
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -1359,9 +1347,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/*
- * remove from free_area[] and mark all as Reserved.
- */
+/* Mark all sections offline and remove all free pages from the buddy. */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
@@ -1379,7 +1365,8 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages,
+ MEMORY_OFFLINE);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1490,7 +1477,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- SKIP_HWPOISON | REPORT_FAILURE);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret < 0) {
reason = "failure to isolate range";
goto failed_removal;
@@ -1584,6 +1571,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
mem_hotplug_done();
return 0;
@@ -1721,13 +1709,13 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- memblock_free(start, size);
- memblock_remove(start, size);
/* remove memory block devices before removing memory */
remove_memory_block_devices(start, size);
arch_remove_memory(nid, start, size, NULL);
+ memblock_free(start, size);
+ memblock_remove(start, size);
__release_memory_resource(start, size);
try_offline_node(nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdfeb697..022e78e68ea0 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -55,8 +55,16 @@ static void pgmap_array_delete(struct resource *res)
static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
- return PHYS_PFN(pgmap->res.start) +
- vmem_altmap_offset(pgmap_altmap(pgmap));
+ const struct resource *res = &pgmap->res;
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+ unsigned long pfn;
+
+ if (altmap)
+ pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ else
+ pfn = PHYS_PFN(res->start);
+
+ return pfn;
}
static unsigned long pfn_end(struct dev_pagemap *pgmap)
@@ -73,6 +81,26 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
+/*
+ * This returns true if the page is reserved by ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
@@ -119,8 +147,10 @@ void memunmap_pages(struct dev_pagemap *pgmap)
nid = page_to_nid(first_page);
mem_hotplug_begin();
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
+ __remove_pages(PHYS_PFN(res->start),
PHYS_PFN(resource_size(res)), NULL);
} else {
arch_remove_memory(nid, res->start, resource_size(res),
diff --git a/mm/mmap.c b/mm/mmap.c
index a7d8c84d19b7..d5ca73facd5d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -641,7 +641,7 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- __vma_link_list(mm, vma, prev, rb_parent);
+ __vma_link_list(mm, vma, prev);
__vma_link_rb(mm, vma, rb_link, rb_parent);
}
@@ -684,37 +684,14 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
static __always_inline void __vma_unlink_common(struct mm_struct *mm,
struct vm_area_struct *vma,
- struct vm_area_struct *prev,
- bool has_prev,
struct vm_area_struct *ignore)
{
- struct vm_area_struct *next;
-
vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- next = vma->vm_next;
- if (has_prev)
- prev->vm_next = next;
- else {
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- }
- if (next)
- next->vm_prev = prev;
-
+ __vma_unlink_list(mm, vma);
/* Kill the cache */
vmacache_invalidate(mm);
}
-static inline void __vma_unlink_prev(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- __vma_unlink_common(mm, vma, prev, true, vma);
-}
-
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
@@ -769,8 +746,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
remove_next = 1 + (end > next->vm_end);
VM_WARN_ON(remove_next == 2 &&
end != next->vm_next->vm_end);
- VM_WARN_ON(remove_next == 1 &&
- end != next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
@@ -889,7 +864,7 @@ again:
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_prev(mm, next, vma);
+ __vma_unlink_common(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -900,7 +875,7 @@ again:
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, NULL, false, vma);
+ __vma_unlink_common(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -1442,7 +1417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (offset_in_page(addr))
+ if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
@@ -1884,6 +1859,22 @@ unacct_error:
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
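
A small worked example of the two helpers above, with illustrative values: for align_mask = 0xffff and align_offset = 0, a candidate gap_start of 0x12340 yields gap_start_offset() = (0 - 0x12340) & 0xffff = 0xdcc0, pushing the start up to 0x20000, the next 64 KiB boundary; symmetrically, gap_end_offset() rounds a gap_end of 0x2ffff down by 0xffff to 0x20000. Folding the offset into the search itself (rather than padding the length) is what lets the lookups below drop the old worst-case length adjustment.
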
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1898,10 +1889,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1933,6 +1921,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1961,6 +1950,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1970,17 +1960,17 @@ check_current:
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1993,16 +1983,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2039,6 +2027,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2073,13 +2062,14 @@ check_current:
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
@@ -3006,15 +2996,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
+ unsigned long mapped_addr;
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (offset_in_page(error))
- return error;
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
error = mlock_future_check(mm, mm->def_flags, len);
if (error)
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29fbe3f..122938dcec15 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (offset_in_page(ret))
+ if (IS_ERR_VALUE(ret))
goto out1;
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
@@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (offset_in_page(new_addr)) {
+ if (IS_ERR_VALUE(new_addr)) {
ret = new_addr;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 99b7ec318824..21ddf689d4fa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -637,7 +637,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
if (rb_prev)
prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- __vma_link_list(mm, vma, prev, parent);
+ __vma_link_list(mm, vma, prev);
}
/*
@@ -673,13 +673,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
/* remove from the MM's tree and list */
rb_erase(&vma->vm_rb, &mm->mm_rb);
- if (vma->vm_prev)
- vma->vm_prev->vm_next = vma->vm_next;
- else
- mm->mmap = vma->vm_next;
-
- if (vma->vm_next)
- vma->vm_next->vm_prev = vma->vm_prev;
+ __vma_unlink_list(mm, vma);
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71e3acea7817..314ce1a3cf25 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,36 +377,13 @@ static void select_bad_process(struct oom_control *oc)
}
}
-static int dump_task(struct task_struct *p, void *arg)
-{
- struct oom_control *oc = arg;
- struct task_struct *task;
-
- if (oom_unkillable_task(p))
- return 0;
-
- /* p may not have freeable memory in nodemask */
- if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
- return 0;
- task = find_lock_task_mm(p);
- if (!task) {
- /*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
- */
- return 0;
+static int add_candidate_task(struct task_struct *p, void *arg)
+{
+ if (!oom_unkillable_task(p)) {
+ get_task_struct(p);
+ list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
}
-
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
- task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- mm_pgtables_bytes(task->mm),
- get_mm_counter(task->mm, MM_SWAPENTS),
- task->signal->oom_score_adj, task->comm);
- task_unlock(task);
-
return 0;
}
@@ -422,19 +399,41 @@ static int dump_task(struct task_struct *p, void *arg)
*/
static void dump_tasks(struct oom_control *oc)
{
- pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+ LIST_HEAD(list);
+ struct task_struct *p;
+ struct task_struct *t;
if (is_memcg_oom(oc))
- mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+ mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
else {
- struct task_struct *p;
-
rcu_read_lock();
for_each_process(p)
- dump_task(p, oc);
+ add_candidate_task(p, &list);
rcu_read_unlock();
}
+ pr_info("Tasks state (memory values in pages):\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+ list_for_each_entry(p, &list, oom_victim_list) {
+ cond_resched();
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ continue;
+ /* All of p's threads might have already detached their mm's. */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ t->pid, from_kuid(&init_user_ns, task_uid(t)),
+ t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
+ mm_pgtables_bytes(t->mm),
+ get_mm_counter(t->mm, MM_SWAPENTS),
+ t->signal->oom_score_adj, t->comm);
+ task_unlock(t);
+ }
+ list_for_each_entry_safe(p, t, &list, oom_victim_list) {
+ list_del(&p->oom_victim_list);
+ put_task_struct(p);
+ }
}
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ecc3dbad606b..12f3ce09d33d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1948,6 +1948,14 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
+ * The number of managed pages has changed due to the initialisation
+	 * so the pcpu batch and high limits need to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -5944,10 +5952,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
#ifdef CONFIG_ZONE_DEVICE
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
- unsigned long size,
+ unsigned long nr_pages,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
@@ -5964,7 +5972,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- size = end_pfn - start_pfn;
+ nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -6011,7 +6019,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
- size, jiffies_to_msecs(jiffies - start));
+ nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif
@@ -7985,6 +7993,15 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+static void __zone_pcp_update(struct zone *zone)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
+}
+
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
@@ -8016,13 +8033,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
- for_each_populated_zone(zone) {
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
- }
+ for_each_populated_zone(zone)
+ __zone_pcp_update(zone);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
@@ -8258,7 +8270,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
if (__PageMovable(page))
@@ -8474,7 +8486,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, false)) {
+ if (test_pages_isolated(outer_start, end, 0)) {
pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
@@ -8499,6 +8511,107 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+
+static int __alloc_contig_pages(unsigned long start_pfn,
+ unsigned long nr_pages, gfp_t gfp_mask)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ gfp_mask);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ page = pfn_to_online_page(i);
+ if (!page)
+ return false;
+
+ if (page_zone(page) != z)
+ return false;
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages: Number of contiguous pages to allocate
+ * @gfp_mask: GFP mask to limit search and used during compaction
+ * @nid: Target node
+ * @nodemask: Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two, then the allocated range is also guaranteed to be aligned to
+ * nr_pages (e.g. a 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ unsigned long ret, pfn, flags;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(gfp_mask), nodemask) {
+ spin_lock_irqsave(&zone->lock, flags);
+
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_contig_pages(pfn, nr_pages,
+ gfp_mask);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+ return NULL;
+}
#endif /* CONFIG_CONTIG_ALLOC */
void free_contig_range(unsigned long pfn, unsigned int nr_pages)
@@ -8514,21 +8627,16 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalculated.
*/
void __meminit zone_pcp_update(struct zone *zone)
{
- unsigned cpu;
mutex_lock(&pcp_batch_high_lock);
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
+ __zone_pcp_update(zone);
mutex_unlock(&pcp_batch_high_lock);
}
-#endif
void zone_pcp_reset(struct zone *zone)
{
@@ -8559,7 +8667,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- unsigned int order, i;
+ unsigned int order;
unsigned long pfn;
unsigned long flags;
unsigned long offlined_pages = 0;
@@ -8587,7 +8695,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- SetPageReserved(page);
offlined_pages++;
continue;
}
@@ -8601,8 +8708,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
pfn, 1 << order, end_pfn);
#endif
del_page_from_free_area(page, &zone->free_area[order]);
- for (i = 0; i < (1 << order); i++)
- SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
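
The kernel-doc above describes the new alloc_contig_pages() interface: scan the zones of a node for a suitable PFN range and hand it to alloc_contig_range(). A minimal usage sketch, assuming kernel context with CONFIG_CONTIG_ALLOC enabled; the helper name use_contig_block() is made up for illustration:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Allocate nr_pages physically contiguous pages on node nid, then free them. */
static int use_contig_block(unsigned long nr_pages, int nid)
{
        struct page *page;

        /* Scans applicable zones for a free, movable PFN range of nr_pages. */
        page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN, nid, NULL);
        if (!page)
                return -ENOMEM;

        /* ... use the contiguous range starting at page_to_pfn(page) ... */

        /* Hand the pages back to the buddy allocator. */
        free_contig_range(page_to_pfn(page), nr_pages);
        return 0;
}
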
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600f9131..83db25aee6f1 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,6 +22,7 @@
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
+#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <asm/pgtable.h>
@@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool synchronous)
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
struct gendisk *disk;
+ unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
+
+ /*
+ * Count submission time as memory stall. When the device is congested,
+ * or the submitting cgroup IO-throttled, submission can be a
+ * significant part of overall IO time.
+ */
+ psi_memstall_enter(&pflags);
+
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool synchronous)
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
count_vm_event(PSWPIN);
- return ret;
+ goto out;
}
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
@@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
count_vm_event(PSWPIN);
- return 0;
+ goto out;
}
ret = 0;
@@ -418,6 +428,7 @@ int swap_readpage(struct page *page, bool synchronous)
bio_put(bio);
out:
+ psi_memstall_leave(&pflags);
return ret;
}
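
The swap_readpage() change above follows a general psi annotation rule: psi_memstall_enter() and psi_memstall_leave() must pair on every exit path, which is why the early returns become jumps to the shared out: label. A condensed sketch of that pattern, assuming kernel context; submit_and_wait() is a hypothetical stand-in for the actual submission work:

#include <linux/mm.h>
#include <linux/psi.h>

static int read_with_stall_accounting(struct page *page)
{
        unsigned long pflags;
        int ret;

        /* Time spent between enter and leave is accounted as a memory stall. */
        psi_memstall_enter(&pflags);

        ret = submit_and_wait(page);    /* hypothetical submission helper */

        /* Single exit point keeps the enter/leave calls balanced on all paths. */
        psi_memstall_leave(&pflags);
        return ret;
}
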
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 89c19c0feadb..04ee1663cdbe 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -168,7 +168,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
- * SKIP_HWPOISON - ignore hwpoison pages
+ * MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ * e.g., skip over PageHWPoison() pages
* REPORT_FAILURE - report details about the failure to
* isolate the range
*
@@ -257,7 +258,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages)
+ int flags)
{
struct page *page;
@@ -274,7 +275,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* simple way to verify that as VM_BUG_ON(), though.
*/
pfn += 1 << page_order(page);
- else if (skip_hwpoisoned_pages && PageHWPoison(page))
+ else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
else
@@ -286,7 +287,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
/* Caller should ensure that requested range is in a single zone */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- bool skip_hwpoisoned_pages)
+ int isol_flags)
{
unsigned long pfn, flags;
struct page *page;
@@ -308,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
- skip_hwpoisoned_pages);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
spin_unlock_irqrestore(&zone->lock, flags);
trace_test_pages_isolated(start_pfn, end_pfn, pfn);
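
With SKIP_HWPOISON replaced by MEMORY_OFFLINE, a caller that isolates a range in order to take it offline (rather than to allocate from it) passes the flag both when starting isolation and when verifying it. A rough sketch of that sequence, assuming kernel context and a PFN range within a single zone; it only mirrors the shape of what memory offlining does:

#include <linux/errno.h>
#include <linux/mmzone.h>
#include <linux/page-isolation.h>

static int isolate_range_for_offline(unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        /* MEMORY_OFFLINE: tolerate HWPoison pages, nothing will be allocated. */
        ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
                                       MEMORY_OFFLINE | REPORT_FAILURE);
        if (ret)
                return ret;

        /* Returns 0 only when every page in the range is free or isolated. */
        if (test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE)) {
                undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
                return -EBUSY;
        }

        return 0;
}
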
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 532c29276fce..3d7c01e76efc 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -24,18 +24,27 @@ void pgd_clear_bad(pgd_t *pgd)
pgd_clear(pgd);
}
+#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
p4d_ERROR(*p4d);
p4d_clear(p4d);
}
+#endif
+#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
pud_clear(pud);
}
+#endif
+/*
+ * Note that the pmd variant below can't be stubbed out the way the p4d/pud
+ * variants above are: pmd folding is special, and pmd_* macros typically
+ * refer to the upper level even when folded.
+ */
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c7b2a9400d4..2487c9dd9649 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -251,18 +251,37 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before the
+ * call, we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
+ struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
+
+ /*
+ * If the parent shares an anon_vma with its vm_prev, keep this sharing
+ * in the child.
+ *
+ * 1. Parent has vm_prev, which implies we have vm_prev.
+ * 2. Parent and its vm_prev have the same anon_vma.
+ */
+ if (!dst->anon_vma && src->anon_vma &&
+ pprev && pprev->anon_vma == src->anon_vma)
+ dst->anon_vma = prev->anon_vma;
+
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
@@ -287,8 +306,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma != src->anon_vma && anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
@@ -458,9 +477,10 @@ void __init anon_vma_init(void)
* chain and verify that the page in question is indeed mapped in it
* [ something equivalent to page_mapped_in_vma() ].
*
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
*/
struct anon_vma *page_get_anon_vma(struct page *page)
{
@@ -1273,12 +1293,20 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (TestClearPageDoubleMap(page)) {
/*
* Subpages can be mapped with PTEs too. Check how many of
- * themi are still mapped.
+ * them are still mapped.
*/
for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
+
+ /*
+ * Queue the page for deferred split if at least one small
+ * page of the compound page is unmapped, but at least one
+ * small page is still mapped.
+ */
+ if (nr && nr < HPAGE_PMD_NR)
+ deferred_split_huge_page(page);
} else {
nr = HPAGE_PMD_NR;
}
@@ -1286,10 +1314,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
- if (nr) {
+ if (nr)
__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
- deferred_split_huge_page(page);
- }
}
/**
diff --git a/mm/shmem.c b/mm/shmem.c
index 220be9fa2c41..447fd575587c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
shmem_falloc->waitq &&
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* It's polite to up mmap_sem if we can */
- up_read(&vma->vm_mm->mmap_sem);
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
ret = VM_FAULT_RETRY;
- }
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
return ret;
}
spin_unlock(&inode->i_lock);
@@ -3928,7 +3929,7 @@ out2:
static ssize_t shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int values[] = {
+ static const int values[] = {
SHMEM_HUGE_ALWAYS,
SHMEM_HUGE_WITHIN_SIZE,
SHMEM_HUGE_ADVISE,
diff --git a/mm/slab.c b/mm/slab.c
index 66e5d8032bae..f1e1840af533 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1247,9 +1247,10 @@ void __init kmem_cache_init(void)
* structures first. Without this, further allocations will bug.
*/
kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
- kmalloc_info[INDEX_NODE].name,
- kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
- 0, kmalloc_size(INDEX_NODE));
+ kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
+ kmalloc_info[INDEX_NODE].size,
+ ARCH_KMALLOC_FLAGS, 0,
+ kmalloc_info[INDEX_NODE].size);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
diff --git a/mm/slab.h b/mm/slab.h
index 68e455f2b698..3eb29ae75743 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -139,7 +139,7 @@ extern struct kmem_cache *kmem_cache;
/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
- const char *name;
+ const char *name[NR_KMALLOC_TYPES];
unsigned int size;
} kmalloc_info[];
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9fb27b4c843..8afa188f6e20 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1139,26 +1139,56 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
+#ifdef CONFIG_ZONE_DMA
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
+ .size = __size, \
+}
+#else
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .size = __size, \
+}
+#endif
+
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
* kmalloc-67108864.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
- {NULL, 0}, {"kmalloc-96", 96},
- {"kmalloc-192", 192}, {"kmalloc-8", 8},
- {"kmalloc-16", 16}, {"kmalloc-32", 32},
- {"kmalloc-64", 64}, {"kmalloc-128", 128},
- {"kmalloc-256", 256}, {"kmalloc-512", 512},
- {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
- {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
- {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
- {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
- {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
- {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
- {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
- {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
- {"kmalloc-64M", 67108864}
+ INIT_KMALLOC_INFO(0, 0),
+ INIT_KMALLOC_INFO(96, 96),
+ INIT_KMALLOC_INFO(192, 192),
+ INIT_KMALLOC_INFO(8, 8),
+ INIT_KMALLOC_INFO(16, 16),
+ INIT_KMALLOC_INFO(32, 32),
+ INIT_KMALLOC_INFO(64, 64),
+ INIT_KMALLOC_INFO(128, 128),
+ INIT_KMALLOC_INFO(256, 256),
+ INIT_KMALLOC_INFO(512, 512),
+ INIT_KMALLOC_INFO(1024, 1k),
+ INIT_KMALLOC_INFO(2048, 2k),
+ INIT_KMALLOC_INFO(4096, 4k),
+ INIT_KMALLOC_INFO(8192, 8k),
+ INIT_KMALLOC_INFO(16384, 16k),
+ INIT_KMALLOC_INFO(32768, 32k),
+ INIT_KMALLOC_INFO(65536, 64k),
+ INIT_KMALLOC_INFO(131072, 128k),
+ INIT_KMALLOC_INFO(262144, 256k),
+ INIT_KMALLOC_INFO(524288, 512k),
+ INIT_KMALLOC_INFO(1048576, 1M),
+ INIT_KMALLOC_INFO(2097152, 2M),
+ INIT_KMALLOC_INFO(4194304, 4M),
+ INIT_KMALLOC_INFO(8388608, 8M),
+ INIT_KMALLOC_INFO(16777216, 16M),
+ INIT_KMALLOC_INFO(33554432, 32M),
+ INIT_KMALLOC_INFO(67108864, 64M)
};
/*
@@ -1208,36 +1238,14 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static const char *
-kmalloc_cache_name(const char *prefix, unsigned int size)
-{
-
- static const char units[3] = "\0kM";
- int idx = 0;
-
- while (size >= 1024 && (size % 1024 == 0)) {
- size /= 1024;
- idx++;
- }
-
- return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
-}
-
static void __init
-new_kmalloc_cache(int idx, int type, slab_flags_t flags)
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- const char *name;
-
- if (type == KMALLOC_RECLAIM) {
+ if (type == KMALLOC_RECLAIM)
flags |= SLAB_RECLAIM_ACCOUNT;
- name = kmalloc_cache_name("kmalloc-rcl",
- kmalloc_info[idx].size);
- BUG_ON(!name);
- } else {
- name = kmalloc_info[idx].name;
- }
- kmalloc_caches[type][idx] = create_kmalloc_cache(name,
+ kmalloc_caches[type][idx] = create_kmalloc_cache(
+ kmalloc_info[idx].name[type],
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
}
@@ -1249,7 +1257,8 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags)
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
- int i, type;
+ int i;
+ enum kmalloc_cache_type type;
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
@@ -1278,12 +1287,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
- unsigned int size = kmalloc_size(i);
- const char *n = kmalloc_cache_name("dma-kmalloc", size);
-
- BUG_ON(!n);
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
- n, size, SLAB_CACHE_DMA | flags, 0, 0);
+ kmalloc_info[i].name[KMALLOC_DMA],
+ kmalloc_info[i].size,
+ SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
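
The INIT_KMALLOC_INFO() macro above relies on preprocessor stringification of the short size, so every per-type cache name becomes a compile-time string literal instead of being built at boot with kasprintf(). For reference, with CONFIG_ZONE_DMA enabled, INIT_KMALLOC_INFO(1024, 1k) expands to roughly this designated initializer:

{
        .name[KMALLOC_NORMAL]  = "kmalloc-1k",
        .name[KMALLOC_RECLAIM] = "kmalloc-rcl-1k",
        .name[KMALLOC_DMA]     = "dma-kmalloc-1k",
        .size = 1024,
}
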
diff --git a/mm/slub.c b/mm/slub.c
index b25c807a111f..249e4c8be66a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -93,9 +93,7 @@
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
- * Overloading of page flags that are otherwise used for LRU management.
- *
- * PageActive The slab is frozen and exempt from list processing.
+ * page->frozen The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
@@ -111,7 +109,7 @@
* free objects in addition to the regular freelist
* that requires the slab lock.
*
- * PageError Slab requires special handling due to debug
+ * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
* options set. This moves slab handling out of
* the fast path and disables lockless freelists.
*/
@@ -736,6 +734,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
{
u8 *fault;
u8 *end;
+ u8 *addr = page_address(page);
metadata_access_enable();
fault = memchr_inv(start, value, bytes);
@@ -748,8 +747,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
end--;
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault[0], value);
+ pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault - addr,
+ fault[0], value);
print_trailer(s, page, object);
restore_bytes(s, what, value, fault, end);
@@ -844,7 +844,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1992b1..a04c754ec99c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -483,7 +483,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
sparsemap_buf =
- memblock_alloc_try_nid_raw(size, PAGE_SIZE,
+ memblock_alloc_exact_nid_raw(size, PAGE_SIZE,
addr,
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
sparsemap_buf_end = sparsemap_buf + size;
diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..5341ae93861f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -373,9 +373,16 @@ static void __lru_cache_activate_page(struct page *page)
void mark_page_accessed(struct page *page)
{
page = compound_head(page);
- if (!PageActive(page) && !PageUnevictable(page) &&
- PageReferenced(page)) {
+ if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ } else if (PageUnevictable(page)) {
+ /*
+ * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
+ * this list is never rotated or maintained, so marking an
+ * unevictable page accessed has no effect.
+ */
+ } else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
* activate_page_pvecs. Otherwise, assume the page is on a
@@ -389,8 +396,6 @@ void mark_page_accessed(struct page *page)
ClearPageReferenced(page);
if (page_is_file_cache(page))
workingset_activation(page);
- } else if (!PageReferenced(page)) {
- SetPageReferenced(page);
}
if (page_is_idle(page))
clear_page_idle(page);
@@ -708,9 +713,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
*/
void lru_add_drain_all(void)
{
+ static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
- int cpu;
+ int cpu, seq;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -719,7 +725,19 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
+ seq = raw_read_seqcount_latch(&seqcount);
+
mutex_lock(&lock);
+
+ /*
+ * Piggyback on a drain that started and finished while we waited for the
+ * lock: all pages pending at the time we entered were drained from the vectors.
+ */
+ if (__read_seqcount_retry(&seqcount, seq))
+ goto done;
+
+ raw_write_seqcount_latch(&seqcount);
+
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
@@ -740,6 +758,7 @@ void lru_add_drain_all(void)
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
+done:
mutex_unlock(&lock);
}
#else
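
The lru_add_drain_all() hunk above is an instance of a small piggybacking idiom: sample a sequence counter before sleeping on the mutex, and if the counter moved while waiting, another caller has already performed a full drain that covers everything pending at the time of entry. Condensed to its essentials as a sketch (kernel context assumed; drain_everything() is a hypothetical stand-in for scheduling the per-CPU work):

#include <linux/mutex.h>
#include <linux/seqlock.h>

static void drain_all_piggyback(void)
{
        static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
        static DEFINE_MUTEX(lock);
        int seq;

        /* Snapshot the sequence before contending on the mutex. */
        seq = raw_read_seqcount_latch(&seqcount);

        mutex_lock(&lock);

        /*
         * If the sequence changed while we slept on the mutex, a full drain
         * ran after our snapshot, so our work has already been done.
         */
        if (__read_seqcount_retry(&seqcount, seq))
                goto done;

        /* Announce a new drain pass to later piggybackers, then do the work. */
        raw_write_seqcount_latch(&seqcount);
        drain_everything();     /* hypothetical */
done:
        mutex_unlock(&lock);
}
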
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dab43523afdd..bb3261d45b6a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2887,6 +2887,13 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
return error;
+ /*
+ * Zoned block devices contain zones that have a sequential
+ * write only restriction. Hence zoned block devices are not
+ * suitable for swapping. Disallow them here.
+ */
+ if (blk_queue_is_zoned(p->bdev->bd_queue))
+ return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c7ae74ce5ff3..1b0d7abad1d4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,36 @@
#include <asm/tlbflush.h>
#include "internal.h"
+static __always_inline
+struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ /*
+ * Make sure that the dst range is both valid and fully within a
+ * single existing vma.
+ */
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma)
+ return NULL;
+
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ return NULL;
+
+ /*
+ * Check that the vma is registered in uffd; this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ return NULL;
+
+ return dst_vma;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -60,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
+ * preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
@@ -184,7 +214,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
- struct hstate *h;
unsigned long vma_hpagesize;
pgoff_t idx;
u32 hash;
@@ -221,20 +250,9 @@ retry:
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is
- * required to enforce the VM_MAYWRITE check done at
- * uffd registration time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out_unlock;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
@@ -243,10 +261,6 @@ retry:
vm_shared = dst_vma->vm_flags & VM_SHARED;
}
- if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
- (len - copied) & (vma_hpagesize - 1)))
- goto out_unlock;
-
/*
* If not shared, ensure the dst_vma has a anon_vma.
*/
@@ -256,24 +270,21 @@ retry:
goto out_unlock;
}
- h = hstate_vma(dst_vma);
-
while (src_addr < src_start + len) {
pte_t dst_pteval;
BUG_ON(dst_addr >= dst_start + len);
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
* Serialize via hugetlb_fault_mutex
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out_unlock;
@@ -300,7 +311,8 @@ retry:
err = copy_huge_page_from_user(page,
(const void __user *)src_addr,
- pages_per_huge_page(h), true);
+ vma_hpagesize / PAGE_SIZE,
+ true);
if (unlikely(err)) {
err = -EFAULT;
goto out;
@@ -475,20 +487,9 @@ retry:
* both valid and fully within a single existing vma.
*/
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma)
goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is required to
- * enforce the VM_MAYWRITE check done at uffd registration
- * time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out_unlock;
err = -EINVAL;
/*
diff --git a/mm/util.c b/mm/util.c
index 3ad6db9a722e..988d11e6c17c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -271,7 +271,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
EXPORT_SYMBOL(memdup_user_nul);
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
struct vm_area_struct *next;
@@ -280,18 +280,28 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
next = prev->vm_next;
prev->vm_next = vma;
} else {
+ next = mm->mmap;
mm->mmap = vma;
- if (rb_parent)
- next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- next = NULL;
}
vma->vm_next = next;
if (next)
next->vm_prev = vma;
}
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *prev, *next;
+
+ next = vma->vm_next;
+ prev = vma->vm_prev;
+ if (prev)
+ prev->vm_next = next;
+ else
+ mm->mmap = next;
+ if (next)
+ next->vm_prev = prev;
+}
+
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3c70e275f4e..f48f64c8d200 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
@@ -968,6 +969,19 @@ adjust_va_to_fit_type(struct vmap_area *va,
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() for why. If so,
+ * GFP_NOWAIT is used instead to get an extra object for
+ * splitting purposes. That is rare and most of the time
+ * does not occur.
+ *
+ * What happens if an allocation fails? Basically, an
+ * "overflow" path is triggered to purge lazily freed
+ * areas in order to free some memory, then the "retry"
+ * path is triggered to repeat the attempt one more time.
+ * See more details in alloc_vmap_area().
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
@@ -1063,9 +1077,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
might_sleep();
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
- va = kmem_cache_alloc_node(vmap_area_cachep,
- gfp_mask & GFP_RECLAIM_MASK, node);
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1073,49 +1087,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
/*
- * Preload this CPU with one extra vmap_area object to ensure
- * that we have it available when fit type of free area is
- * NE_FIT_TYPE.
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when the fit type of a free area is NE_FIT_TYPE. Please note
+ * that this does not guarantee the allocation occurs on a CPU
+ * that is preloaded; instead we minimize the cases when it is
+ * not. That can happen because of CPU migration, since there is
+ * a race window until the spinlock below is taken.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure.
+ * low memory condition and high memory pressure. In the rare
+ * case that the CPU is not preloaded, GFP_NOWAIT is used.
*
- * Even if it fails we do not really care about that. Just proceed
- * as it is. "overflow" path will refill the cache we allocate from.
+ * Set "pva" to NULL here, because of "retry" path.
*/
- preempt_disable();
- if (!__this_cpu_read(ne_fit_preload_node)) {
- preempt_enable();
- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
- preempt_disable();
-
- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
- if (pva)
- kmem_cache_free(vmap_area_cachep, pva);
- }
- }
+ pva = NULL;
- spin_lock(&vmap_area_lock);
- preempt_enable();
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed, the "overflow"
+ * path will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(&free_vmap_area_lock);
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -1125,7 +1145,6 @@ retry:
return va;
overflow:
- spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
@@ -1161,28 +1180,25 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
-static void __free_vmap_area(struct vmap_area *va)
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
{
/*
* Remove from the busy tree/list.
*/
+ spin_lock(&vmap_area_lock);
unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
/*
- * Merge VA with its neighbors, otherwise just add it.
+ * Insert/Merge it back to the free tree/list.
*/
+ spin_lock(&free_vmap_area_lock);
merge_or_add_vmap_area(va,
&free_vmap_area_root, &free_vmap_area_list);
-}
-
-/*
- * Free a region of KVA allocated by alloc_vmap_area
- */
-static void free_vmap_area(struct vmap_area *va)
-{
- spin_lock(&vmap_area_lock);
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
}
/*
@@ -1275,7 +1291,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
@@ -1290,9 +1306,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
- cond_resched_lock(&vmap_area_lock);
+ cond_resched_lock(&free_vmap_area_lock);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
return true;
}
@@ -2014,15 +2030,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
spin_unlock(&vmap_area_lock);
}
@@ -2440,7 +2462,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
@@ -3262,7 +3284,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
@@ -3344,29 +3366,38 @@ retry:
va = vas[area];
va->va_start = start;
va->va_end = start + size;
-
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
/* insert all vm's */
- for (area = 0; area < nr_vms; area++)
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ spin_lock(&vmap_area_lock);
+ for (area = 0; area < nr_vms; area++) {
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ }
+ spin_unlock(&vmap_area_lock);
kfree(vas);
return vms;
recovery:
- /* Remove previously inserted areas. */
+ /*
+ * Remove previously allocated areas. There is no
+ * need to remove these areas from the busy tree,
+ * because they are inserted only on the final step
+ * and only when pcpu_get_vm_areas() succeeds.
+ */
while (area--) {
- __free_vmap_area(vas[area]);
+ merge_or_add_vmap_area(vas[area],
+ &free_vmap_area_root, &free_vmap_area_list);
vas[area] = NULL;
}
overflow:
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
@@ -3417,9 +3448,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_purge_lock)
__acquires(&vmap_area_lock)
{
+ mutex_lock(&vmap_purge_lock);
spin_lock(&vmap_area_lock);
+
return seq_list_start(&vmap_area_list, *pos);
}
@@ -3429,8 +3463,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmap_purge_lock)
__releases(&vmap_area_lock)
{
+ mutex_unlock(&vmap_purge_lock);
spin_unlock(&vmap_area_lock);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee4eecc7e1c2..1154b3a2b637 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -775,7 +775,7 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
-static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -823,8 +823,7 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping,
- struct scan_control *sc)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -860,7 +859,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host, sc))
+ if (!may_write_to_inode(mapping->host))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -1394,7 +1393,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(page, mapping, sc)) {
+ switch (pageout(page, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -2302,8 +2301,7 @@ enum scan_balance {
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *nr,
- unsigned long *lru_pages)
+ struct scan_control *sc, unsigned long *nr)
{
int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -2454,7 +2452,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
@@ -2549,7 +2546,6 @@ out:
BUG();
}
- *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2558,7 +2554,7 @@ out:
* This is a basic per-node page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *lru_pages)
+ struct scan_control *sc)
{
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
unsigned long nr[NR_LRU_LISTS];
@@ -2570,7 +2566,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2758,7 +2754,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
memset(&sc->nr, 0, sizeof(sc->nr));
@@ -2768,7 +2763,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, NULL);
do {
- unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
@@ -2805,8 +2799,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
- node_lru_pages += lru_pages;
+ shrink_node_memcg(pgdat, memcg, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
sc->priority);
@@ -3317,15 +3310,16 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
- unsigned long lru_pages;
WARN_ON_ONCE(!current->reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.gfp_mask);
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(
+ cgroup_ino(memcg->css.cgroup),
+ sc.order,
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3334,9 +3328,11 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+ shrink_node_memcg(pgdat, memcg, &sc);
- trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(
+ cgroup_ino(memcg->css.cgroup),
+ sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
@@ -3375,7 +3371,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ trace_mm_vmscan_memcg_reclaim_begin(
+ cgroup_ino(memcg->css.cgroup),
+ 0, sc.gfp_mask);
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
@@ -3385,7 +3383,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ trace_mm_vmscan_memcg_reclaim_end(
+ cgroup_ino(memcg->css.cgroup),
+ nr_reclaimed);
set_task_reclaim_state(current, NULL);
return nr_reclaimed;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6afc892a148a..c37e07a73c30 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1084,7 +1084,8 @@ int fragmentation_index(struct zone *zone, unsigned int order)
}
#endif
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
+ defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
@@ -1134,7 +1135,7 @@ const char * const vmstat_text[] = {
"numa_other",
#endif
- /* Node-based counters */
+ /* enum node_stat_item counters */
"nr_inactive_anon",
"nr_active_anon",
"nr_inactive_file",
@@ -1172,7 +1173,7 @@ const char * const vmstat_text[] = {
"nr_dirty_threshold",
"nr_dirty_background_threshold",
-#ifdef CONFIG_VM_EVENT_COUNTERS
+#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/* enum vm_event_item counters */
"pgpgin",
"pgpgout",
@@ -1291,9 +1292,9 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
#endif
-#endif /* CONFIG_VM_EVENTS_COUNTERS */
+#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
-#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
defined(CONFIG_PROC_FS)
@@ -1373,23 +1374,54 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
{
int order, mtype;
+ unsigned long nfree[MAX_ORDER][MIGRATE_TYPES];
- for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
- seq_printf(m, "Node %4d, zone %8s, type %12s ",
- pgdat->node_id,
- zone->name,
- migratetype_names[mtype]);
- for (order = 0; order < MAX_ORDER; ++order) {
+ lockdep_assert_held(&zone->lock);
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * MIGRATE_MOVABLE is usually the largest one in large memory
+ * systems. We skip iterating that list. Instead, we compute it by
+ * subtracting the total of the rest from free_area->nr_free.
+ */
+ for (order = 0; order < MAX_ORDER; ++order) {
+ unsigned long nr_total = 0;
+ struct free_area *area = &(zone->free_area[order]);
+
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
unsigned long freecount = 0;
- struct free_area *area;
struct list_head *curr;
- area = &(zone->free_area[order]);
-
+ if (mtype == MIGRATE_MOVABLE)
+ continue;
list_for_each(curr, &area->free_list[mtype])
freecount++;
- seq_printf(m, "%6lu ", freecount);
+ nfree[order][mtype] = freecount;
+ nr_total += freecount;
+ }
+ nfree[order][MIGRATE_MOVABLE] = area->nr_free - nr_total;
+
+ /*
+ * If we have already iterated more than 64k of list
+ * entries, we might have held the zone lock for too long.
+ * Temporarily release the lock and reschedule before
+ * continuing so that other lock waiters have a chance
+ * to run.
+ */
+ if (nr_total > (1 << 16)) {
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
}
+ }
+
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
+ seq_printf(m, "Node %4d, zone %8s, type %12s ",
+ pgdat->node_id,
+ zone->name,
+ migratetype_names[mtype]);
+ for (order = 0; order < MAX_ORDER; ++order)
+ seq_printf(m, "%6lu ", nfree[order][mtype]);
seq_putc(m, '\n');
}
}
@@ -1547,10 +1579,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- seq_printf(m, "\n %-12s %lu",
- vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
- NR_VM_NUMA_STAT_ITEMS],
- node_page_state(pgdat, i));
+ seq_printf(m, "\n %-12s %lu", node_stat_name(i),
+ node_page_state(pgdat, i));
}
}
seq_printf(m,
@@ -1583,14 +1613,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
+ seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
+ zone_page_state(zone, i));
#ifdef CONFIG_NUMA
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu",
- vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
- zone_numa_state_snapshot(zone, i));
+ seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
+ zone_numa_state_snapshot(zone, i));
#endif
seq_printf(m, "\n pagesets");
@@ -1641,31 +1670,23 @@ static const struct seq_operations zoneinfo_op = {
.show = zoneinfo_show,
};
-enum writeback_stat_item {
- NR_DIRTY_THRESHOLD,
- NR_DIRTY_BG_THRESHOLD,
- NR_VM_WRITEBACK_STAT_ITEMS,
-};
+#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
+ NR_VM_NUMA_STAT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + \
+ NR_VM_WRITEBACK_STAT_ITEMS + \
+ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
+ NR_VM_EVENT_ITEMS : 0))
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
- int i, stat_items_size;
+ int i;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
- stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
-
-#ifdef CONFIG_VM_EVENT_COUNTERS
- stat_items_size += sizeof(struct vm_event_state);
-#endif
- BUILD_BUG_ON(stat_items_size !=
- ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
- v = kmalloc(stat_items_size, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
@@ -1698,7 +1719,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
(*pos)++;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
return (unsigned long *)m->private + *pos;
}
@@ -1764,7 +1785,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
- __func__, vmstat_text[i], val);
+ __func__, zone_stat_name(i), val);
err = -EINVAL;
}
}
@@ -1773,7 +1794,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
val = atomic_long_read(&vm_numa_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
- __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+ __func__, numa_stat_name(i), val);
err = -EINVAL;
}
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 6d3d3f698ebb..d48d0ec3bcdd 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/rwlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
@@ -90,6 +91,7 @@ struct z3fold_buddy_slots {
*/
unsigned long slot[BUDDY_MASK + 1];
unsigned long pool; /* back link + flags */
+ rwlock_t lock;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -124,6 +126,7 @@ struct z3fold_header {
unsigned short start_middle;
unsigned short first_num:2;
unsigned short mapped_count:2;
+ unsigned short foreign_handles:2;
};
/**
@@ -178,6 +181,19 @@ enum z3fold_page_flags {
PAGE_CLAIMED, /* by either reclaim or free */
};
+/*
+ * handle flags, go under HANDLE_FLAG_MASK
+ */
+enum z3fold_handle_flags {
+ HANDLES_ORPHANED = 0,
+};
+
+/*
+ * Forward declarations
+ */
+static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
+static void compact_page_work(struct work_struct *w);
+
/*****************
* Helpers
*****************/
@@ -191,8 +207,6 @@ static int size_to_chunks(size_t size)
#define for_each_unbuddied_list(_iter, _begin) \
for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-static void compact_page_work(struct work_struct *w);
-
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
@@ -204,6 +218,7 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
if (slots) {
memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
+ rwlock_init(&slots->lock);
}
return slots;
@@ -219,27 +234,108 @@ static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
}
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+ spin_lock(&zhdr->page_lock);
+}
+
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+ spin_unlock(&zhdr->page_lock);
+}
+
+
+static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
+ bool lock)
+{
+ struct z3fold_buddy_slots *slots;
+ struct z3fold_header *zhdr;
+ int locked = 0;
+
+ if (!(handle & (1 << PAGE_HEADLESS))) {
+ slots = handle_to_slots(handle);
+ do {
+ unsigned long addr;
+
+ read_lock(&slots->lock);
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ if (lock)
+ locked = z3fold_page_trylock(zhdr);
+ read_unlock(&slots->lock);
+ if (locked)
+ break;
+ cpu_relax();
+ } while (lock);
+ } else {
+ zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
+ }
+
+ return zhdr;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, false);
+}
+
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, true);
+}
+
+static inline void put_z3fold_header(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_unlock(zhdr);
+}
+
static inline void free_handle(unsigned long handle)
{
struct z3fold_buddy_slots *slots;
+ struct z3fold_header *zhdr;
int i;
bool is_free;
if (handle & (1 << PAGE_HEADLESS))
return;
- WARN_ON(*(unsigned long *)handle == 0);
- *(unsigned long *)handle = 0;
+ if (WARN_ON(*(unsigned long *)handle == 0))
+ return;
+
+ zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
+ write_lock(&slots->lock);
+ *(unsigned long *)handle = 0;
+ write_unlock(&slots->lock);
+ if (zhdr->slots == slots)
+ return; /* simple case, nothing else to do */
+
+ /* we are freeing a foreign handle if we are here */
+ zhdr->foreign_handles--;
is_free = true;
+ read_lock(&slots->lock);
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
break;
}
}
+ read_unlock(&slots->lock);
- if (is_free) {
+ if (is_free && test_and_clear_bit(HANDLES_ORPHANED, &slots->pool)) {
struct z3fold_pool *pool = slots_to_pool(slots);
kmem_cache_free(pool->c_handle, slots);
@@ -322,6 +418,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
+ zhdr->foreign_handles = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
@@ -341,24 +438,6 @@ static void free_z3fold_page(struct page *page, bool headless)
__free_page(page);
}
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
- spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
- return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
- spin_unlock(&zhdr->page_lock);
-}
-
/* Helper function to build the index */
static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
{
@@ -389,7 +468,9 @@ static unsigned long __encode_handle(struct z3fold_header *zhdr,
if (bud == LAST)
h |= (zhdr->last_chunks << BUDDY_SHIFT);
+ write_lock(&slots->lock);
slots->slot[idx] = h;
+ write_unlock(&slots->lock);
return (unsigned long)&slots->slot[idx];
}
@@ -398,17 +479,6 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
return __encode_handle(zhdr, zhdr->slots, bud);
}
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
- unsigned long addr = h;
-
- if (!(addr & (1 << PAGE_HEADLESS)))
- addr = *(unsigned long *)h;
-
- return (struct z3fold_header *)(addr & PAGE_MASK);
-}
-
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
@@ -442,6 +512,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ bool is_free = true;
+ int i;
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
@@ -450,8 +522,25 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
if (!list_empty(&page->lru))
list_del_init(&page->lru);
spin_unlock(&pool->lock);
+
+ /* If there are no foreign handles, free the handles array */
+ read_lock(&zhdr->slots->lock);
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (zhdr->slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+ read_unlock(&zhdr->slots->lock);
+
+ if (is_free)
+ kmem_cache_free(pool->c_handle, zhdr->slots);
+ else
+ set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
+
if (locked)
z3fold_page_unlock(zhdr);
+
spin_lock(&pool->stale_lock);
list_add(&zhdr->buddy, &pool->stale);
queue_work(pool->release_wq, &pool->work);
@@ -479,6 +568,7 @@ static void release_z3fold_page_locked_list(struct kref *ref)
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+
spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
@@ -559,6 +649,113 @@ static inline void *mchunk_memmove(struct z3fold_header *zhdr,
zhdr->middle_chunks << CHUNK_SHIFT);
}
+static inline bool buddy_single(struct z3fold_header *zhdr)
+{
+ return !((zhdr->first_chunks && zhdr->middle_chunks) ||
+ (zhdr->first_chunks && zhdr->last_chunks) ||
+ (zhdr->middle_chunks && zhdr->last_chunks));
+}
+
+static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
+{
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ void *p = zhdr;
+ unsigned long old_handle = 0;
+ size_t sz = 0;
+ struct z3fold_header *new_zhdr = NULL;
+ int first_idx = __idx(zhdr, FIRST);
+ int middle_idx = __idx(zhdr, MIDDLE);
+ int last_idx = __idx(zhdr, LAST);
+
+ /*
+ * No need to protect slots here -- all the slots are "local" and
+ * the page lock is already taken
+ */
+ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
+ p += ZHDR_SIZE_ALIGNED;
+ sz = zhdr->first_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
+ } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
+ p += zhdr->start_middle << CHUNK_SHIFT;
+ sz = zhdr->middle_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
+ } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
+ p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ sz = zhdr->last_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
+ }
+
+ if (sz > 0) {
+ enum buddy new_bud = HEADLESS;
+ short chunks = size_to_chunks(sz);
+ void *q;
+
+ new_zhdr = __z3fold_alloc(pool, sz, false);
+ if (!new_zhdr)
+ return NULL;
+
+ if (WARN_ON(new_zhdr == zhdr))
+ goto out_fail;
+
+ if (new_zhdr->first_chunks == 0) {
+ if (new_zhdr->middle_chunks != 0 &&
+ chunks >= new_zhdr->start_middle) {
+ new_bud = LAST;
+ } else {
+ new_bud = FIRST;
+ }
+ } else if (new_zhdr->last_chunks == 0) {
+ new_bud = LAST;
+ } else if (new_zhdr->middle_chunks == 0) {
+ new_bud = MIDDLE;
+ }
+ q = new_zhdr;
+ switch (new_bud) {
+ case FIRST:
+ new_zhdr->first_chunks = chunks;
+ q += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ new_zhdr->middle_chunks = chunks;
+ new_zhdr->start_middle =
+ new_zhdr->first_chunks + ZHDR_CHUNKS;
+ q += new_zhdr->start_middle << CHUNK_SHIFT;
+ break;
+ case LAST:
+ new_zhdr->last_chunks = chunks;
+ q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
+ break;
+ default:
+ goto out_fail;
+ }
+ new_zhdr->foreign_handles++;
+ memcpy(q, p, sz);
+ write_lock(&zhdr->slots->lock);
+ *(unsigned long *)old_handle = (unsigned long)new_zhdr +
+ __idx(new_zhdr, new_bud);
+ if (new_bud == LAST)
+ *(unsigned long *)old_handle |=
+ (new_zhdr->last_chunks << BUDDY_SHIFT);
+ write_unlock(&zhdr->slots->lock);
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+ }
+
+ return new_zhdr;
+
+out_fail:
+ if (new_zhdr) {
+ if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else {
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+ }
+ }
+ return NULL;
+
+}
+
#define BIG_CHUNK_GAP 3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
@@ -638,6 +835,15 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
+ if (!zhdr->foreign_handles && buddy_single(zhdr) &&
+ compact_single_buddy(zhdr)) {
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+
z3fold_compact_page(zhdr);
add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
@@ -690,7 +896,8 @@ lookup:
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
put_cpu_ptr(pool->unbuddied);
@@ -734,7 +941,8 @@ lookup:
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
if (can_sleep)
@@ -1000,7 +1208,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
enum buddy bud;
bool page_claimed;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
page = virt_to_page(zhdr);
page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
@@ -1014,6 +1222,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
+ put_z3fold_header(zhdr);
free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
@@ -1021,7 +1230,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
/* Non-headless case */
- z3fold_page_lock(zhdr);
bud = handle_to_buddy(handle);
switch (bud) {
@@ -1037,11 +1245,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
default:
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
- free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -1051,9 +1259,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
+ free_handle(handle);
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
return;
}
@@ -1063,14 +1272,14 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
- do_compact_page(zhdr, true);
clear_bit(PAGE_CLAIMED, &page->private);
+ do_compact_page(zhdr, true);
return;
}
kref_get(&zhdr->refcount);
- queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
+ queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ put_z3fold_header(zhdr);
}
/**
@@ -1111,11 +1320,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
*/
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
- int i, ret = 0;
+ int i, ret = -1;
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
struct list_head *pos;
- struct z3fold_buddy_slots slots;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -1153,6 +1361,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
zhdr = NULL;
continue; /* can't evict at this point */
}
+ if (zhdr->foreign_handles) {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ continue; /* can't evict such page */
+ }
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
@@ -1176,39 +1390,38 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
last_handle = 0;
middle_handle = 0;
if (zhdr->first_chunks)
- first_handle = __encode_handle(zhdr, &slots,
- FIRST);
+ first_handle = encode_handle(zhdr, FIRST);
if (zhdr->middle_chunks)
- middle_handle = __encode_handle(zhdr, &slots,
- MIDDLE);
+ middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
- last_handle = __encode_handle(zhdr, &slots,
- LAST);
+ last_handle = encode_handle(zhdr, LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
*/
z3fold_page_unlock(zhdr);
} else {
- first_handle = __encode_handle(zhdr, &slots, HEADLESS);
+ first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
}
-
/* Issue the eviction callback(s) */
if (middle_handle) {
ret = pool->ops->evict(pool, middle_handle);
if (ret)
goto next;
+ free_handle(middle_handle);
}
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
+ free_handle(first_handle);
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
+ free_handle(last_handle);
}
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
@@ -1264,14 +1477,13 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
void *addr;
enum buddy buddy;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
addr = zhdr;
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private))
goto out;
- z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
switch (buddy) {
case FIRST:
@@ -1293,8 +1505,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
if (addr)
zhdr->mapped_count++;
- z3fold_page_unlock(zhdr);
out:
+ put_z3fold_header(zhdr);
return addr;
}
@@ -1309,18 +1521,17 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy buddy;
- zhdr = handle_to_z3fold_header(handle);
+ zhdr = get_z3fold_header(handle);
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private))
return;
- z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
zhdr->mapped_count--;
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
}
/**
@@ -1352,19 +1563,21 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
test_bit(PAGE_STALE, &page->private))
goto out;
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
+ goto out;
+
pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ if (!list_empty(&page->lru))
+ list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
+
+ kref_get(&zhdr->refcount);
+ z3fold_page_unlock(zhdr);
+ return true;
- if (zhdr->mapped_count == 0) {
- kref_get(&zhdr->refcount);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- spin_lock(&pool->lock);
- if (!list_empty(&page->lru))
- list_del(&page->lru);
- spin_unlock(&pool->lock);
- z3fold_page_unlock(zhdr);
- return true;
- }
out:
z3fold_page_unlock(zhdr);
return false;
@@ -1387,7 +1600,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
if (!z3fold_page_trylock(zhdr)) {
return -EAGAIN;
}
- if (zhdr->mapped_count != 0) {
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
z3fold_page_unlock(zhdr);
return -EBUSY;
}
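
The z3fold hunks above move the handle paths from the per-page lock to a handle-slots rwlock: writers publish or clear a handle under the write lock, while the free/release paths scan every slot under the read lock and only free the slots structure once all slots are empty (otherwise it is flagged HANDLES_ORPHANED and left for the last handle holder). Below is a minimal userspace sketch of that scan-then-free pattern, assuming simplified types; demo_slots, NSLOTS and demo_free_handle are illustrative names, not the kernel code, and where the sketch reads a plain bool the kernel uses an atomic test_and_clear_bit().

/*
 * Simplified userspace analogue of the handle-slot scheme: clear a handle
 * under the write lock, scan all slots under the read lock, and free the
 * slots structure only when every slot is empty and the owner is gone.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4

struct demo_slots {
	pthread_rwlock_t lock;
	unsigned long slot[NSLOTS];
	bool orphaned;		/* owner released the page first */
};

static struct demo_slots *demo_slots_alloc(void)
{
	struct demo_slots *s = calloc(1, sizeof(*s));

	if (s)
		pthread_rwlock_init(&s->lock, NULL);
	return s;
}

static bool demo_slots_empty(struct demo_slots *s)
{
	bool empty = true;
	int i;

	pthread_rwlock_rdlock(&s->lock);
	for (i = 0; i < NSLOTS; i++) {
		if (s->slot[i]) {
			empty = false;
			break;
		}
	}
	pthread_rwlock_unlock(&s->lock);
	return empty;
}

/* Clear one handle; free the array if we were the last user of an orphan. */
static void demo_free_handle(struct demo_slots *s, int idx)
{
	pthread_rwlock_wrlock(&s->lock);
	s->slot[idx] = 0;
	pthread_rwlock_unlock(&s->lock);

	/* the kernel does this check with test_and_clear_bit() */
	if (demo_slots_empty(s) && s->orphaned) {
		pthread_rwlock_destroy(&s->lock);
		free(s);
	}
}

int main(void)
{
	struct demo_slots *s = demo_slots_alloc();

	if (!s)
		return 1;
	s->slot[0] = 0xdead;	/* pretend a handle was encoded here */
	s->orphaned = true;	/* owner went away before the handle */
	demo_free_handle(s, 0);	/* last handle gone -> slots freed */
	printf("slots released\n");
	return 0;
}
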
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b71a39ded930..39e14d5edaf1 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -51,7 +51,7 @@ static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list);
int rpc_pipefs_notifier_register(struct notifier_block *nb)
{
- return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb);
+ return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index e9995b880f84..a85d719df1f4 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -607,6 +607,20 @@ foreach my $entry (keys %deprecated_apis) {
}
$deprecated_apis_search = "(?:${deprecated_apis_search})";
+our %deprecated_string_apis = (
+ "strcpy" => "stracpy or strscpy",
+ "strlcpy" => "stracpy or strscpy",
+ "strncpy" => "stracpy or strscpy - for non-NUL-terminated uses, strncpy dest should be __nonstring",
+);
+
+#Create a search pattern for all these string apis to speed up a loop below
+our $deprecated_string_apis_search = "";
+foreach my $entry (keys %deprecated_string_apis) {
+ $deprecated_string_apis_search .= '|' if ($deprecated_string_apis_search ne "");
+ $deprecated_string_apis_search .= $entry;
+}
+$deprecated_string_apis_search = "(?:${deprecated_string_apis_search})";
+
our $mode_perms_world_writable = qr{
S_IWUGO |
S_IWOTH |
@@ -5038,8 +5052,9 @@ sub process {
$var =~ /[A-Z][a-z]|[a-z][A-Z]/ &&
#Ignore Page<foo> variants
$var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ &&
-#Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show)
- $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/ &&
+#Ignore SI style variants like nS, mV and dB
+#(ie: max_uV, regulator_min_uA_show, RANGE_mA_VALUE)
+ $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ &&
#Ignore some three character SI units explicitly, like MiB and KHz
$var !~ /^(?:[a-z_]*?)_?(?:[KMGT]iB|[KMGT]?Hz)(?:_[a-z_]+)?$/) {
while ($var =~ m{($Ident)}g) {
@@ -6484,6 +6499,16 @@ sub process {
"Deprecated use of '$deprecated_api', prefer '$new_api' instead\n" . $herecurr);
}
+# check for deprecated string apis
+ if ($line =~ /\b($deprecated_string_apis_search)\b\s*\(/) {
+ my $deprecated_string_api = $1;
+ my $new_api = $deprecated_string_apis{$deprecated_string_api};
+ my $msg_level = \&WARN;
+ $msg_level = \&CHK if ($file);
+ &{$msg_level}("DEPRECATED_API",
+ "Deprecated use of '$deprecated_string_api', prefer '$new_api' instead\n" . $herecurr);
+ }
+
# check for various structs that are normally const (ops, kgdb, device_tree)
# and avoid what seem like struct definitions 'struct foo {'
if ($line !~ /\bconst\b/ &&
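
The checkpatch hunk above adds a DEPRECATED_API warning (CHK with --file) for strcpy()/strlcpy()/strncpy() call sites, steering authors toward bounded copies that always NUL-terminate, such as strscpy(). The sketch below illustrates the pattern the warning points toward; demo_strscpy() is a hypothetical local stand-in defined only so the example compiles outside the kernel tree, and is not the kernel strscpy() implementation.

#include <stdio.h>
#include <string.h>

/* bounded copy that always NUL-terminates; returns -1 on truncation */
static long demo_strscpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (!size)
		return -1;
	if (len >= size) {
		memcpy(dst, src, size - 1);
		dst[size - 1] = '\0';
		return -1;
	}
	memcpy(dst, src, len + 1);
	return (long)len;
}

int main(void)
{
	char name[8];

	/* strcpy(name, src) would be flagged: no bound, can overflow */
	if (demo_strscpy(name, "a-rather-long-string", sizeof(name)) < 0)
		fprintf(stderr, "truncated to '%s'\n", name);
	return 0;
}
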
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 5ef59214c555..34085d146fa2 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -26,6 +26,7 @@ my $email = 1;
my $email_usename = 1;
my $email_maintainer = 1;
my $email_reviewer = 1;
+my $email_fixes = 1;
my $email_list = 1;
my $email_moderated_list = 1;
my $email_subscriber_list = 0;
@@ -249,6 +250,7 @@ if (!GetOptions(
'r!' => \$email_reviewer,
'n!' => \$email_usename,
'l!' => \$email_list,
+ 'fixes!' => \$email_fixes,
'moderated!' => \$email_moderated_list,
's!' => \$email_subscriber_list,
'multiline!' => \$output_multiline,
@@ -503,6 +505,7 @@ sub read_mailmap {
## use the filenames on the command line or find the filenames in the patchfiles
my @files = ();
+my @fixes = (); # If a patch description includes Fixes: lines
my @range = ();
my @keyword_tvi = ();
my @file_emails = ();
@@ -568,6 +571,8 @@ foreach my $file (@ARGV) {
my $filename2 = $2;
push(@files, $filename1);
push(@files, $filename2);
+ } elsif (m/^Fixes:\s+([0-9a-fA-F]{6,40})/) {
+ push(@fixes, $1) if ($email_fixes);
} elsif (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) {
my $filename = $1;
$filename =~ s@^[^/]*/@@;
@@ -598,6 +603,7 @@ foreach my $file (@ARGV) {
}
@file_emails = uniq(@file_emails);
+@fixes = uniq(@fixes);
my %email_hash_name;
my %email_hash_address;
@@ -612,7 +618,6 @@ my %deduplicate_name_hash = ();
my %deduplicate_address_hash = ();
my @maintainers = get_maintainers();
-
if (@maintainers) {
@maintainers = merge_email(@maintainers);
output(@maintainers);
@@ -927,6 +932,10 @@ sub get_maintainers {
}
}
+ foreach my $fix (@fixes) {
+ vcs_add_commit_signers($fix, "blamed_fixes");
+ }
+
foreach my $email (@email_to, @list_to) {
$email->[0] = deduplicate_email($email->[0]);
}
@@ -1031,6 +1040,7 @@ MAINTAINER field selection options:
--roles => show roles (status:subsystem, git-signer, list, etc...)
--rolestats => show roles and statistics (commits/total_commits, %)
--file-emails => add email addresses found in -f file (default: 0 (off))
+ --fixes => for patches, add signatures of commits with 'Fixes: <commit>' (default: 1 (on))
--scm => print SCM tree(s) if any
--status => print status if any
--subsystem => print subsystem name if any
@@ -1730,6 +1740,32 @@ sub vcs_is_hg {
return $vcs_used == 2;
}
+sub vcs_add_commit_signers {
+ return if (!vcs_exists());
+
+ my ($commit, $desc) = @_;
+ my $commit_count = 0;
+ my $commit_authors_ref;
+ my $commit_signers_ref;
+ my $stats_ref;
+ my @commit_authors = ();
+ my @commit_signers = ();
+ my $cmd;
+
+ $cmd = $VCS_cmds{"find_commit_signers_cmd"};
+ $cmd =~ s/(\$\w+)/$1/eeg; #substitute variables in $cmd
+
+ ($commit_count, $commit_signers_ref, $commit_authors_ref, $stats_ref) = vcs_find_signers($cmd, "");
+ @commit_authors = @{$commit_authors_ref} if defined $commit_authors_ref;
+ @commit_signers = @{$commit_signers_ref} if defined $commit_signers_ref;
+
+ foreach my $signer (@commit_signers) {
+ $signer = deduplicate_email($signer);
+ }
+
+ vcs_assign($desc, 1, @commit_signers);
+}
+
sub interactive_get_maintainers {
my ($list_ref) = @_;
my @list = @$list_ref;
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 2c6f7994a0aa..d65f59669f7c 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -12,6 +12,7 @@ TARGETS += efivarfs
TARGETS += exec
TARGETS += filesystems
TARGETS += filesystems/binderfs
+TARGETS += filesystems/epoll
TARGETS += firmware
TARGETS += ftrace
TARGETS += futex
diff --git a/tools/testing/selftests/filesystems/epoll/.gitignore b/tools/testing/selftests/filesystems/epoll/.gitignore
new file mode 100644
index 000000000000..9ae8db44ec14
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/.gitignore
@@ -0,0 +1 @@
+epoll_wakeup_test
diff --git a/tools/testing/selftests/filesystems/epoll/Makefile b/tools/testing/selftests/filesystems/epoll/Makefile
new file mode 100644
index 000000000000..e62f3d4f68da
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I../../../../../usr/include/
+LDFLAGS += -lpthread
+TEST_GEN_PROGS := epoll_wakeup_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
new file mode 100644
index 000000000000..37a04dab56f0
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -0,0 +1,3074 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <poll.h>
+#include <unistd.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include "../../kselftest_harness.h"
+
+struct epoll_mtcontext
+{
+ int efd[3];
+ int sfd[4];
+ int count;
+
+ pthread_t main;
+ pthread_t waiter;
+};
+
+static void signal_handler(int signum)
+{
+}
+
+static void kill_timeout(struct epoll_mtcontext *ctx)
+{
+ usleep(1000000);
+ pthread_kill(ctx->main, SIGUSR1);
+ pthread_kill(ctx->waiter, SIGUSR1);
+}
+
+static void *waiter_entry1a(void *data)
+{
+ struct epoll_event e;
+ struct epoll_mtcontext *ctx = data;
+
+ if (epoll_wait(ctx->efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx->count, 1);
+
+ return NULL;
+}
+
+static void *waiter_entry1ap(void *data)
+{
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext *ctx = data;
+
+ pfd.fd = ctx->efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx->efd[0], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx->count, 1);
+ }
+
+ return NULL;
+}
+
+static void *waiter_entry1o(void *data)
+{
+ struct epoll_event e;
+ struct epoll_mtcontext *ctx = data;
+
+ if (epoll_wait(ctx->efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx->count, 1);
+
+ return NULL;
+}
+
+static void *waiter_entry1op(void *data)
+{
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext *ctx = data;
+
+ pfd.fd = ctx->efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx->efd[0], &e, 1, 0) > 0)
+ __sync_fetch_and_or(&ctx->count, 1);
+ }
+
+ return NULL;
+}
+
+static void *waiter_entry2a(void *data)
+{
+ struct epoll_event events[2];
+ struct epoll_mtcontext *ctx = data;
+
+ if (epoll_wait(ctx->efd[0], events, 2, -1) > 0)
+ __sync_fetch_and_add(&ctx->count, 1);
+
+ return NULL;
+}
+
+static void *waiter_entry2ap(void *data)
+{
+ struct pollfd pfd;
+ struct epoll_event events[2];
+ struct epoll_mtcontext *ctx = data;
+
+ pfd.fd = ctx->efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx->efd[0], events, 2, 0) > 0)
+ __sync_fetch_and_add(&ctx->count, 1);
+ }
+
+ return NULL;
+}
+
+static void *emitter_entry1(void *data)
+{
+ struct epoll_mtcontext *ctx = data;
+
+ usleep(100000);
+ write(ctx->sfd[1], "w", 1);
+
+ kill_timeout(ctx);
+
+ return NULL;
+}
+
+static void *emitter_entry2(void *data)
+{
+ struct epoll_mtcontext *ctx = data;
+
+ usleep(100000);
+ write(ctx->sfd[1], "w", 1);
+ write(ctx->sfd[3], "w", 1);
+
+ kill_timeout(ctx);
+
+ return NULL;
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * s0
+ */
+TEST(epoll1)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * s0
+ */
+TEST(epoll2)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ */
+TEST(epoll3)
+{
+ int efd;
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ */
+TEST(epoll4)
+{
+ int efd;
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * s0
+ */
+TEST(epoll5)
+{
+ int efd;
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * s0
+ */
+TEST(epoll6)
+{
+ int efd;
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 0);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ */
+
+TEST(epoll7)
+{
+ int efd;
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ */
+TEST(epoll8)
+{
+ int efd;
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * s0
+ */
+TEST(epoll9)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * s0
+ */
+TEST(epoll10)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ */
+TEST(epoll11)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry2a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 2, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ */
+TEST(epoll12)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * s0
+ */
+TEST(epoll13)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * s0
+ */
+TEST(epoll14)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ */
+TEST(epoll15)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry2ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 2, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ */
+TEST(epoll16)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll17)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll18)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll19)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll20)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll21)
+{
+ int efd[2];
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll22)
+{
+ int efd[2];
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll23)
+{
+ int efd[2];
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll24)
+{
+ int efd[2];
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll25)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll26)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll27)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll28)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll29)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll30)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll31)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll32)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll33)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll34)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll35)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll36)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll37)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll38)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (ew)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (lt)
+ *            s0
+ */
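+/*
+ * Here only the e0/e1 link is edge-triggered; e1 still watches s0
+ * level-triggered, so both threads are expected to be woken and ctx.count
+ * must reach exactly 2.
+ */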
+TEST(epoll39)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (ew)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (et)
+ *            s0
+ */
+TEST(epoll40)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) |    | (p)
+ *          |   e0
+ *           \  / (lt)
+ *            e1
+ *             | (lt)
+ *            s0
+ */
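+/*
+ * Level-triggered chain again, but t1 now uses the polling waiter variant
+ * (waiter_entry1ap, matching the (p) label); both threads must still see
+ * the event.
+ */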
+TEST(epoll41)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) |    | (p)
+ *          |   e0
+ *           \  / (lt)
+ *            e1
+ *             | (et)
+ *            s0
+ */
+TEST(epoll42)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) |    | (p)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (lt)
+ *            s0
+ */
+TEST(epoll43)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) |    | (p)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (et)
+ *            s0
+ */
+TEST(epoll44)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (p)
+ *          |   e0
+ *           \  / (lt)
+ *            e1
+ *             | (lt)
+ *            s0
+ */
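+/*
+ * Fully polled variant of the level-triggered chain: t0 polls e1 before a
+ * zero-timeout epoll_wait and t1 polls e0 (per the (p) label); both are
+ * expected to see the event, so ctx.count must reach 2.
+ */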
+TEST(epoll45)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (p)
+ *          |   e0
+ *           \  / (lt)
+ *            e1
+ *             | (et)
+ *            s0
+ */
+TEST(epoll46)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (p)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (lt)
+ *            s0
+ */
+TEST(epoll47)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ pfd.fd = ctx.efd[1];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0    t1
+ *      (p) |    | (p)
+ *          |   e0
+ *           \  / (et)
+ *            e1
+ *             | (et)
+ *            s0
+ */
+TEST(epoll48)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+ __sync_fetch_and_or(&ctx.count, 2);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ *        t0
+ *         | (ew)
+ *        e0
+ *   (lt) /  \ (lt)
+ *      e1    e2
+ * (lt) |      | (lt)
+ *     s0     s2
+ */
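+/*
+ * Single-threaded, level-triggered fan-out: s0 and s2 stay readable, so two
+ * consecutive epoll_wait calls on e0 each report both nested epoll fds.
+ */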
+TEST(epoll49)
+{
+ int efd[3];
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ *        t0
+ *         | (ew)
+ *        e0
+ *   (et) /  \ (et)
+ *      e1    e2
+ * (lt) |      | (lt)
+ *     s0     s2
+ */
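+/*
+ * Same fan-out with e1 and e2 added to e0 edge-triggered: their readiness is
+ * reported only once, so the first epoll_wait returns both events and the
+ * second returns 0 even though the sockets were never drained.
+ */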
+TEST(epoll50)
+{
+ int efd[3];
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ *        t0
+ *         | (p)
+ *        e0
+ *   (lt) /  \ (lt)
+ *      e1    e2
+ * (lt) |      | (lt)
+ *     s0     s2
+ */
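+/*
+ * Same level-triggered fan-out as epoll49, but checked via poll(): e0 stays
+ * readable, so poll() and a zero-timeout epoll_wait succeed twice in a row.
+ */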
+TEST(epoll51)
+{
+ int efd[3];
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ *        t0
+ *         | (p)
+ *        e0
+ *   (et) /  \ (et)
+ *      e1    e2
+ * (lt) |      | (lt)
+ *     s0     s2
+ */
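+/*
+ * Edge-triggered counterpart of epoll51: once the first poll()/epoll_wait
+ * pair has consumed the edges for e1 and e2, the second poll() reports e0
+ * as not readable and epoll_wait returns 0.
+ */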
+TEST(epoll52)
+{
+ int efd[3];
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) \  / (ew)
+ *           e0
+ *     (lt) /  \ (lt)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
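+/*
+ * Two threads wait on the same top-level e0 over two level-triggered
+ * children; both must be woken, so ctx.count is expected to reach exactly 2.
+ */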
+TEST(epoll53)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) \  / (ew)
+ *           e0
+ *     (et) /  \ (et)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
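+/*
+ * Even with e1 and e2 linked into e0 edge-triggered, each child contributes
+ * its own edge, so both waiting threads are still expected to be woken
+ * (ctx.count == 2).
+ */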
+TEST(epoll54)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) \  / (p)
+ *           e0
+ *     (lt) /  \ (lt)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
+TEST(epoll55)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *     (ew) \  / (p)
+ *           e0
+ *     (et) /  \ (et)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
+TEST(epoll56)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *      (p) \  / (p)
+ *           e0
+ *     (lt) /  \ (lt)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
+TEST(epoll57)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ pfd.fd = ctx.efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[0], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ *        t0    t1
+ *      (p) \  / (p)
+ *           e0
+ *     (et) /  \ (et)
+ *        e1    e2
+ *   (lt) |      | (lt)
+ *       s0     s2
+ */
+TEST(epoll58)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ pfd.fd = ctx.efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) {
+ if (epoll_wait(ctx.efd[0], &e, 1, 0) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
index 1c0d76cb5adf..93b90a9b1eeb 100644
--- a/tools/testing/selftests/vm/config
+++ b/tools/testing/selftests/vm/config
@@ -1,2 +1,3 @@
CONFIG_SYSVIPC=y
CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m
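+# needed by test_vmalloc.sh, which loads the test_vmalloc module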
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
index cb3fc09645c4..485cf06ef013 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
flags |= MAP_SHARED;
break;
case 'H':
- flags |= MAP_HUGETLB;
+ flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
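+ /* without a hugetlbfs file descriptor, MAP_HUGETLB requires MAP_ANONYMOUS */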
break;
default:
return -1;