summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2019-02-18 17:50:32 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2019-02-18 17:50:32 +1100
commit521ffb22b5e3eefc5f4a44d540acf746d6233cde (patch)
tree49d1315305821316e769260a3db93485540d89ac /kernel
parentf5f8a43b28ac49c9182d5ed86724550bbaeff3e9 (diff)
parentc435ec1e25d37f0778ee0077e4ed1181f38e9d5d (diff)
Merge branch 'akpm-current/current'
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore2
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/cgroup/cgroup.c107
-rw-r--r--kernel/configs.c42
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/gcov/gcc_3_4.c6
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/kcov.c15
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/power/snapshot.c17
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/fair.c15
-rw-r--r--kernel/sched/psi.c609
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c1
-rw-r--r--kernel/sysctl.c101
-rw-r--r--kernel/workqueue.c10
22 files changed, 824 insertions, 157 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..6e699100872f 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,7 +1,5 @@
#
# Generated files
#
-config_data.h
-config_data.gz
timeconst.h
hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..6c57e78817da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
-$(obj)/configs.o: $(obj)/config_data.h
+$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-
-filechk_ikconfiggz = \
- echo "static const char kernel_config_data[] __used = MAGIC_START"; \
- cat $< | scripts/bin2c; \
- echo "MAGIC_END;"
-
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5a8e35e61835..7adc9ac7c413 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3508,7 +3508,89 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
}
-#endif
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *old;
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ old = of->priv;
+ rcu_assign_pointer(of->priv, new);
+ if (old) {
+ synchronize_rcu();
+ psi_trigger_destroy(old);
+ }
+
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ struct psi_trigger *t;
+ __poll_t ret;
+
+ rcu_read_lock();
+ t = rcu_dereference_raw(of->priv);
+ if (t)
+ ret = psi_trigger_poll(t, of->file, pt);
+ else
+ ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ struct psi_trigger *t = of->priv;
+
+ if (!t)
+ return;
+
+ rcu_assign_pointer(of->priv, NULL);
+ synchronize_rcu();
+ psi_trigger_destroy(t);
+}
+#endif /* CONFIG_PSI */
static int cgroup_file_open(struct kernfs_open_file *of)
{
@@ -3577,6 +3659,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3615,6 +3707,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3623,6 +3716,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -4651,18 +4745,27 @@ static struct cftype cgroup_base_files[] = {
.name = "io.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
-#endif
+#endif /* CONFIG_PSI */
{ } /* terminate */
};
diff --git a/kernel/configs.c b/kernel/configs.c
index 2df132b20217..b062425ccf8d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -30,37 +30,35 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-/**************************************************/
-/* the actual current config file */
-
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
static const struct file_operations ikconfig_file_ops = {
@@ -79,7 +77,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..2b750f13bc8f 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,7 +158,7 @@ out:
bool dma_in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
diff --git a/kernel/fork.c b/kernel/fork.c
index 08e1f9f22ddf..97d1a29babe2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -919,6 +919,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_high_reclaim = NULL;
#endif
return tsk;
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66c9563..2dddecbdbe6e 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
/* Duplicate gcov_info. */
active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
@@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
+ iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
GFP_KERNEL);
if (iter)
iter->info = info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 4a9191617076..0c11216171c9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -19,6 +19,7 @@
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/sched.h>
diff --git a/kernel/kcov.c b/kernel/kcov.c
index c2277dbdbfb1..2ee38727844a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -20,6 +20,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <linux/refcount.h>
#include <asm/setup.h>
/* Number of 64-bit words written per one comparison: */
@@ -44,7 +45,7 @@ struct kcov {
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
*/
- atomic_t refcount;
+ refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
@@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
static void kcov_get(struct kcov *kcov)
{
- atomic_inc(&kcov->refcount);
+ refcount_inc(&kcov->refcount);
}
static void kcov_put(struct kcov *kcov)
{
- if (atomic_dec_and_test(&kcov->refcount)) {
+ if (refcount_dec_and_test(&kcov->refcount)) {
vfree(kcov->area);
kfree(kcov);
}
@@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
- atomic_set(&kcov->refcount, 1);
+ refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
@@ -444,10 +445,8 @@ static int __init kcov_init(void)
* there is no need to protect it against removal races. The
* use of debugfs_create_file_unsafe() is actually safe here.
*/
- if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
- pr_err("failed to create kcov in debugfs\n");
- return -ENOMEM;
- }
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 65234c89d85b..103773a8e5e9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
diff --git a/kernel/module.c b/kernel/module.c
index 2ad1b5239910..0b9aa8ab89f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig
{
if (!debug)
return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, mod->name))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
+ ddebug_add_module(debug, num, mod->name);
}
static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
diff --git a/kernel/panic.c b/kernel/panic.c
index f121e6ba7e11..0ae0d7332f12 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..4802b039b89f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7aafbeeda2f..22d7b09d3540 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8213ff6e365d..ea74d43924b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c3484785b179..0bb130692ad6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
* Copyright (c) 2018 Facebook, Inc.
* Author: Johannes Weiner <hannes@cmpxchg.org>
*
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
* When CPU, memory and IO are contended, tasks experience delays that
* reduce throughput and introduce latencies into the workload. Memory
* and IO contention, in addition, can cause a full loss of forward
@@ -127,11 +130,16 @@
#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
+#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
+#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"
@@ -140,9 +148,9 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
#else
-bool psi_enable = true;
+static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
@@ -151,11 +159,16 @@ static int __init setup_psi(char *str)
__setup("psi=", setup_psi);
/* Running averages - we need to be higher-res than loadavg */
-#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
+#define PSI_FREQ (2*HZ+1UL) /* 2 sec intervals */
#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
#define EXP_60s 1981 /* 1/exp(2s/60s) */
#define EXP_300s 2034 /* 1/exp(2s/300s) */
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
+
/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;
@@ -173,9 +186,18 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->next_update = sched_clock() + psi_period;
+ group->avg_next_update = sched_clock() + psi_period;
+ atomic_set(&group->polling, 0);
INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
- mutex_init(&group->stat_lock);
+ mutex_init(&group->update_lock);
+ /* Init trigger-related members */
+ INIT_LIST_HEAD(&group->triggers);
+ memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+ group->trigger_states = 0;
+ group->trigger_min_period = U32_MAX;
+ memset(group->polling_total, 0, sizeof(group->polling_total));
+ group->polling_next_update = ULLONG_MAX;
+ group->polling_until = 0;
}
void __init psi_init(void)
@@ -210,20 +232,23 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
}
}
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
+ u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
- unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
+ enum psi_states s;
unsigned int seq;
- int s;
+ u32 state_mask;
+
+ *pchanged_states = 0;
/* Snapshot a coherent view of the CPU state */
do {
seq = read_seqcount_begin(&groupc->seq);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
- memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_mask = groupc->state_mask;
state_start = groupc->state_start;
} while (read_seqcount_retry(&groupc->seq, seq));
@@ -239,13 +264,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
* (u32) and our reported pressure close to what's
* actually happening.
*/
- if (test_state(tasks, s))
+ if (state_mask & (1 << s))
times[s] += now - state_start;
delta = times[s] - groupc->times_prev[s];
groupc->times_prev[s] = times[s];
times[s] = delta;
+ if (delta)
+ *pchanged_states |= (1 << s);
}
}
@@ -269,17 +296,14 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
avg[2] = calc_load(avg[2], EXP_300s, pct);
}
-static bool update_stats(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
{
u64 deltas[NR_PSI_STATES - 1] = { 0, };
- unsigned long missed_periods = 0;
unsigned long nonidle_total = 0;
- u64 now, expires, period;
+ u32 changed_states = 0;
int cpu;
int s;
- mutex_lock(&group->stat_lock);
-
/*
* Collect the per-cpu time buckets and average them into a
* single time sample that is normalized to wallclock time.
@@ -291,8 +315,10 @@ static bool update_stats(struct psi_group *group)
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
u32 nonidle;
+ u32 cpu_changed_states;
- get_recent_times(group, cpu, times);
+ get_recent_times(group, cpu, times, &cpu_changed_states);
+ changed_states |= cpu_changed_states;
nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
nonidle_total += nonidle;
@@ -317,12 +343,19 @@ static bool update_stats(struct psi_group *group)
for (s = 0; s < NR_PSI_STATES - 1; s++)
group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
- /* avgX= */
- now = sched_clock();
- expires = group->next_update;
- if (now < expires)
- goto out;
- if (now - expires > psi_period)
+ if (pchanged_states)
+ *pchanged_states = changed_states;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
+
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
missed_periods = div_u64(now - expires, psi_period);
/*
@@ -332,14 +365,14 @@ static bool update_stats(struct psi_group *group)
* But the deltas we sample out of the per-cpu buckets above
* are based on the actual time elapsing between clock ticks.
*/
- group->next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->last_update + (missed_periods * psi_period));
- group->last_update = now;
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
for (s = 0; s < NR_PSI_STATES - 1; s++) {
u32 sample;
- sample = group->total[s] - group->total_prev[s];
+ sample = group->total[s] - group->avg_total[s];
/*
* Due to the lockless sampling of the time buckets,
* recorded time deltas can slip into the next period,
@@ -359,23 +392,244 @@ static bool update_stats(struct psi_group *group)
*/
if (sample > period)
sample = period;
- group->total_prev[s] += sample;
+ group->avg_total[s] += sample;
calc_avgs(group->avg[s], missed_periods, sample, period);
}
-out:
- mutex_unlock(&group->stat_lock);
- return nonidle_total;
+
+ return avg_next_update;
+}
+
+/* Trigger tracking window manupulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->triggers, node)
+ window_reset(&t->win, now, group->total[t->state], 0);
+ memcpy(group->polling_total, group->total,
+ sizeof(group->polling_total));
+ group->polling_next_update = now + group->trigger_min_period;
}
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+ bool new_stall = false;
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, &group->triggers, node) {
+ u64 growth;
+
+ /* Check for stall activity */
+ if (group->polling_total[t->state] == group->total[t->state])
+ continue;
+
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ new_stall = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, group->total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0)
+ wake_up_interruptible(&t->event_wait);
+ t->last_event_time = now;
+ }
+
+ if (new_stall) {
+ memcpy(group->polling_total, group->total,
+ sizeof(group->polling_total));
+ }
+
+ return now + group->trigger_min_period;
+}
+
+/*
+ * psi_update_work represents slowpath accounting part while psi_group_change
+ * represents hotpath part. There are two potential races between them:
+ * 1. Changes to group->polling when slowpath checks for new stall, then hotpath
+ * records new stall and then slowpath resets group->polling flag. This leads
+ * to the exit from the polling mode while monitored state is still changing.
+ * 2. Slowpath overwriting an immediate update scheduled from the hotpath with
+ * a regular update further in the future and missing the immediate update.
+ * Both races are handled with a retry cycle in the slowpath:
+ *
+ * HOTPATH: | SLOWPATH:
+ * |
+ * A) times[cpu] += delta | E) delta = times[*]
+ * B) start_poll = (delta[poll_mask] &&| polling = g->polling
+ * cmpxchg(g->polling, 0, 1) == 0)| if delta[poll_mask]:
+ * if start_poll: | F) polling_until = now + grace_period
+ * C) mod_delayed_work(1) | if now > polling_until:
+ * else if !delayed_work_pending():| if polling:
+ * D) schedule_delayed_work(PSI_FREQ)| G) g->polling = polling = 0
+ * | smp_mb
+ * | H) goto SLOWPATH
+ * | else:
+ * | if !polling:
+ * | I) g->polling = polling = 1
+ * | J) if delta && first_pass:
+ * | next_avg = update_averages()
+ * | if polling:
+ * | next_poll = update_triggers()
+ * | if (delta && first_pass) || polling:
+ * | K) mod_delayed_work(
+ * | min(next_avg, next_poll))
+ * | if !polling:
+ * | first_pass = false
+ * | L) goto SLOWPATH
+ *
+ * Race #1 is represented by (EABGD) sequence in which case slowpath deactivates
+ * polling mode because it misses new monitored stall and hotpath doesn't
+ * activate it because at (B) g->polling is not yet reset by slowpath in (G).
+ * This race is handled by the (H) retry, which in the race described above
+ * results in the new sequence of (EABGDHEIK) that reactivates polling mode.
+ *
+ * Race #2 is represented by polling==false && (JABCK) sequence which overwrites
+ * immediate update scheduled at (C) with a later (next_avg) update scheduled at
+ * (K). This race is handled by the (L) retry which results in the new sequence
+ * of polling==false && (JABCKLEIK) that reactivates polling mode and
+ * reschedules the next polling update (next_poll).
+ *
+ * Note that retries can't result in an infinite loop because retry #1 happens
+ * only during polling reactivation and retry #2 happens only on the first pass.
+ * Constant reactivations are impossible because polling will stay active for at
+ * least grace_period. Worst case scenario involves two retries (HEJKLE)
+ */
static void psi_update_work(struct work_struct *work)
{
struct delayed_work *dwork;
struct psi_group *group;
+ bool first_pass = true;
+ u64 next_update;
+ u32 changed_states;
+ int polling;
bool nonidle;
+ u64 now;
dwork = to_delayed_work(work);
group = container_of(dwork, struct psi_group, clock_work);
+ mutex_lock(&group->update_lock);
+
+ now = sched_clock();
+
+retry:
+ collect_percpu_times(group, &changed_states);
+ polling = atomic_read(&group->polling);
+
+ if (changed_states & group->trigger_states) {
+ /* Initialize trigger windows when entering polling mode */
+ if (now > group->polling_until)
+ init_triggers(group, now);
+
+ /*
+ * Keep the monitor active for at least the duration of the
+ * minimum tracking window as long as monitor states are
+ * changing. This prevents frequent changes to polling flag
+ * when system bounces in and out of stall states.
+ */
+ group->polling_until = now +
+ group->trigger_min_period * UPDATES_PER_WINDOW;
+ }
+
+ /* Handle polling flag transitions */
+ if (now > group->polling_until) {
+ if (polling) {
+ group->polling_next_update = ULLONG_MAX;
+ polling = 0;
+ atomic_set(&group->polling, polling);
+ /*
+ * Memory barrier is needed to order group->polling=0
+ * write before times[] reads in collect_percpu_times()
+ * to detect possible race with hotpath that modifies
+ * times[] before it sets group->polling=1 (see Race #1
+ * description in the comments at the top).
+ */
+ smp_mb();
+ /*
+ * Check if we missed stall recorded by hotpath while
+ * polling flag was set to 1 causing hotpath to skip
+ * entering polling mode
+ */
+ goto retry;
+ }
+ } else {
+ if (!polling) {
+ /*
+ * This can happen as a fixup in the retry cycle after
+ * new stall is discovered
+ */
+ polling = 1;
+ atomic_set(&group->polling, polling);
+ }
+ }
+ /*
+ * At this point group->polling race with hotpath is resolved and
+ * we rely on local polling flag ignoring possible further changes
+ * to group->polling
+ */
+
+ nonidle = (changed_states & (1 << PSI_NONIDLE));
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -383,18 +637,34 @@ static void psi_update_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
+ if (nonidle && first_pass) {
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
- nonidle = update_stats(group);
-
- if (nonidle) {
- unsigned long delay = 0;
- u64 now;
-
- now = sched_clock();
- if (group->next_update > now)
- delay = nsecs_to_jiffies(group->next_update - now) + 1;
- schedule_delayed_work(dwork, delay);
+ if (now >= group->polling_next_update) {
+ group->polling_next_update = update_triggers(
+ group, now);
+ }
}
+ if ((nonidle && first_pass) || polling) {
+ /* Calculate closest update time */
+ next_update = min(group->polling_next_update,
+ group->avg_next_update);
+ mod_delayed_work(system_wq, dwork, nsecs_to_jiffies(
+ next_update - now) + 1);
+ if (!polling) {
+ /*
+ * We might have overwritten an immediate update
+ * scheduled from the hotpath with a longer regular
+ * update (group->avg_next_update). Execute second pass
+ * retry to discover that and resume polling.
+ */
+ first_pass = false;
+ goto retry;
+ }
+ }
+
+ mutex_unlock(&group->update_lock);
}
static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +677,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
delta = now - groupc->state_start;
groupc->state_start = now;
- if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ if (groupc->state_mask & (1 << PSI_IO_SOME)) {
groupc->times[PSI_IO_SOME] += delta;
- if (test_state(groupc->tasks, PSI_IO_FULL))
+ if (groupc->state_mask & (1 << PSI_IO_FULL))
groupc->times[PSI_IO_FULL] += delta;
}
- if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
groupc->times[PSI_MEM_SOME] += delta;
- if (test_state(groupc->tasks, PSI_MEM_FULL))
+ if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
else if (memstall_tick) {
u32 sample;
@@ -436,18 +706,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
- if (test_state(groupc->tasks, PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME))
groupc->times[PSI_CPU_SOME] += delta;
- if (test_state(groupc->tasks, PSI_NONIDLE))
+ if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
-static void psi_group_change(struct psi_group *group, int cpu,
+static u32 psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
+ enum psi_states s;
+ u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -480,7 +752,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
+ /* Calculate state mask representing active states */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ if (test_state(groupc->tasks, s))
+ state_mask |= (1 << s);
+ }
+ groupc->state_mask = state_mask;
+
write_seqcount_end(&groupc->seq);
+
+ return state_mask;
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -541,7 +822,27 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wake_clock = false;
while ((group = iterate_groups(task, &iter))) {
- psi_group_change(group, cpu, clear, set);
+ u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+ /*
+ * Polling flag resets to 0 at the max rate of once per update
+ * window (at least 500ms interval). smp_wmb is required after
+ * group->polling 0-to-1 transition to order groupc->times and
+ * group->polling writes because stall detection logic in the
+ * slowpath relies on groupc->times changing before
+ * group->polling. Explicit smp_wmb is missing because cmpxchg()
+ * implies smp_mb.
+ */
+ if ((state_mask & group->trigger_states) &&
+ atomic_cmpxchg(&group->polling, 0, 1) == 0) {
+ /*
+ * Start polling immediately even if the work is already
+ * scheduled
+ */
+ mod_delayed_work(system_wq, &group->clock_work, 1);
+ continue;
+ }
+
if (wake_clock && !delayed_work_pending(&group->clock_work))
schedule_delayed_work(&group->clock_work, PSI_FREQ);
}
@@ -642,6 +943,8 @@ void psi_cgroup_free(struct cgroup *cgroup)
cancel_delayed_work_sync(&cgroup->psi.clock_work);
free_percpu(cgroup->psi.pcpu);
+ /* All triggers must be removed by now by psi_trigger_destroy */
+ WARN_ONCE(cgroup->psi.trigger_states, "psi: trigger leak\n");
}
/**
@@ -701,7 +1004,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- update_stats(group);
+ /* Update averages before reporting them */
+ mutex_lock(&group->update_lock);
+ collect_percpu_times(group, NULL);
+ update_averages(group, sched_clock());
+ mutex_unlock(&group->update_lock);
for (full = 0; full < 2 - (res == PSI_CPU); full++) {
unsigned long avg[3];
@@ -753,25 +1060,223 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
return single_open(file, psi_cpu_show, NULL);
}
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *t;
+ enum psi_states state;
+ u32 threshold_us;
+ u32 window_us;
+
+ if (static_branch_likely(&psi_disabled))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_SOME + res * 2;
+ else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_FULL + res * 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+ if (state >= PSI_NONIDLE)
+ return ERR_PTR(-EINVAL);
+
+ if (window_us < WINDOW_MIN_US ||
+ window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /* Check threshold */
+ if (threshold_us == 0 || threshold_us > window_us)
+ return ERR_PTR(-EINVAL);
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+
+ t->group = group;
+ t->state = state;
+ t->threshold = threshold_us * NSEC_PER_USEC;
+ t->win.size = window_us * NSEC_PER_USEC;
+ window_reset(&t->win, 0, 0, 0);
+
+ t->event = 0;
+ t->last_event_time = 0;
+ init_waitqueue_head(&t->event_wait);
+
+ mutex_lock(&group->update_lock);
+
+ list_add(&t->node, &group->triggers);
+ group->trigger_min_period = min(group->trigger_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->nr_triggers[t->state]++;
+ group->trigger_states |= (1 << t->state);
+
+ mutex_unlock(&group->update_lock);
+
+ return t;
+}
+
+void psi_trigger_destroy(struct psi_trigger *t)
+{
+ struct psi_group *group = t->group;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ mutex_lock(&group->update_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->nr_triggers[t->state]--;
+ if (!group->nr_triggers[t->state])
+ group->trigger_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->triggers, node) {
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ }
+ group->trigger_min_period = period;
+ /*
+ * Wakeup waiters to stop polling.
+ * Can happen if cgroup is deleted from under
+ * a polling process.
+ */
+ wake_up_interruptible(&t->event_wait);
+ kfree(t);
+ }
+ mutex_unlock(&group->update_lock);
+}
+
+__poll_t psi_trigger_poll(struct psi_trigger *t,
+ struct file *file, poll_table *wait)
+{
+ if (static_branch_likely(&psi_disabled))
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ poll_wait(file, &t->event_wait, wait);
+
+ if (cmpxchg(&t->event, 1, 0) == 1)
+ return DEFAULT_POLLMASK | EPOLLPRI;
+
+ /* Wait */
+ return DEFAULT_POLLMASK;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, enum psi_res res)
+{
+ char buf[32];
+ size_t buf_size;
+ struct seq_file *seq;
+ struct psi_trigger *old;
+ struct psi_trigger *new;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ buf_size = min(nbytes, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size - 1] = '\0';
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ seq = file->private_data;
+ /* Take seq->lock to protect seq->private from concurrent writes */
+ mutex_lock(&seq->lock);
+ old = seq->private;
+ rcu_assign_pointer(seq->private, new);
+ mutex_unlock(&seq->lock);
+
+ if (old) {
+ synchronize_rcu();
+ psi_trigger_destroy(old);
+ }
+
+ return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file,
+ const char __user *user_buf, size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+ struct psi_trigger *t;
+ __poll_t ret;
+
+ rcu_read_lock();
+ t = rcu_dereference_raw(seq->private);
+ if (t)
+ ret = psi_trigger_poll(t, file, wait);
+ else
+ ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ rcu_read_unlock();
+
+ return ret;
+
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct psi_trigger *t = seq->private;
+
+ if (static_branch_likely(&psi_disabled) || !t)
+ goto out;
+
+ rcu_assign_pointer(seq->private, NULL);
+ synchronize_rcu();
+ psi_trigger_destroy(t);
+out:
+ return single_release(inode, file);
+}
+
static const struct file_operations psi_io_fops = {
.open = psi_io_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_io_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_memory_fops = {
.open = psi_memory_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_memory_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_cpu_fops = {
.open = psi_cpu_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_cpu_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static int __init psi_proc_init(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..3a9e41197d46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3003,7 +3003,7 @@ static bool known_siginfo_layout(unsigned sig, int si_code)
if (si_code == SI_KERNEL)
return true;
else if ((si_code > SI_USER)) {
- if (sig_specific_sicodes(sig)) {
+ if (sig && sig_specific_sicodes(sig)) {
if (si_code <= sig_sicodes[sig].limit)
return true;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index c5f875048aef..12df0e5434b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
+ /* fall through */
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 987ae08147bf..4fce842c9bfb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,8 @@
#include <linux/bpf.h>
#include <linux/mount.h>
+#include "../lib/kstrtox.h"
+
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -127,6 +129,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -1457,7 +1460,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1733,6 +1736,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2103,6 +2108,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2146,7 +2186,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2588,23 +2629,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
+ int tmp, ret;
struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
if (write) {
- int val = *negp ? -*lvalp : *lvalp;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -EINVAL;
- *valp = val;
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
+ *valp = tmp;
}
+
return 0;
}
@@ -2653,22 +2696,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
+ int ret;
+ unsigned int tmp;
struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
- if (write) {
- unsigned int val = *lvalp;
-
- if (*lvalp > UINT_MAX)
- return -EINVAL;
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -ERANGE;
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
+ *valp = tmp;
}
return 0;
@@ -2816,8 +2859,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (neg)
continue;
val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max))
- continue;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
*i = val;
} else {
val = convdiv * (*i) / convmul;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8cecbe309e30..0c85fb6e2737 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -918,6 +918,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* CONTEXT:
* spin_lock_irq(rq->lock)
*
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
* Return:
* The last work function %current executed as a worker, NULL if it
* hasn't executed any work yet.