diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-02-18 17:50:32 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-02-18 17:50:32 +1100 |
commit | 521ffb22b5e3eefc5f4a44d540acf746d6233cde (patch) | |
tree | 49d1315305821316e769260a3db93485540d89ac /kernel | |
parent | f5f8a43b28ac49c9182d5ed86724550bbaeff3e9 (diff) | |
parent | c435ec1e25d37f0778ee0077e4ed1181f38e9d5d (diff) |
Merge branch 'akpm-current/current'
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/.gitignore | 2 | ||||
-rw-r--r-- | kernel/Makefile | 11 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 107 | ||||
-rw-r--r-- | kernel/configs.c | 42 | ||||
-rw-r--r-- | kernel/crash_core.c | 2 | ||||
-rw-r--r-- | kernel/dma/remap.c | 2 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/gcov/gcc_3_4.c | 6 | ||||
-rw-r--r-- | kernel/hung_task.c | 1 | ||||
-rw-r--r-- | kernel/kcov.c | 15 | ||||
-rw-r--r-- | kernel/kthread.c | 3 | ||||
-rw-r--r-- | kernel/module.c | 6 | ||||
-rw-r--r-- | kernel/panic.c | 10 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 17 | ||||
-rw-r--r-- | kernel/ptrace.c | 15 | ||||
-rw-r--r-- | kernel/sched/core.c | 3 | ||||
-rw-r--r-- | kernel/sched/fair.c | 15 | ||||
-rw-r--r-- | kernel/sched/psi.c | 609 | ||||
-rw-r--r-- | kernel/signal.c | 2 | ||||
-rw-r--r-- | kernel/sys.c | 1 | ||||
-rw-r--r-- | kernel/sysctl.c | 101 | ||||
-rw-r--r-- | kernel/workqueue.c | 10 |
22 files changed, 824 insertions, 157 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..6e699100872f 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,7 +1,5 @@ # # Generated files # -config_data.h -config_data.gz timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6aa7543bcdb2..6c57e78817da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n -$(obj)/configs.o: $(obj)/config_data.h +$(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - -filechk_ikconfiggz = \ - echo "static const char kernel_config_data[] __used = MAGIC_START"; \ - cat $< | scripts/bin2c; \ - echo "MAGIC_END;" - -targets += config_data.h -$(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call filechk,ikconfiggz) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5a8e35e61835..7adc9ac7c413 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3508,7 +3508,89 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *old; + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + old = of->priv; + rcu_assign_pointer(of->priv, new); + if (old) { + synchronize_rcu(); + psi_trigger_destroy(old); + } + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + struct psi_trigger *t; + __poll_t ret; + + rcu_read_lock(); + t = rcu_dereference_raw(of->priv); + if (t) + ret = psi_trigger_poll(t, of->file, pt); + else + ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + rcu_read_unlock(); + + return ret; +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + struct psi_trigger *t = of->priv; + + if (!t) + return; + + rcu_assign_pointer(of->priv, NULL); + synchronize_rcu(); + psi_trigger_destroy(t); +} +#endif /* CONFIG_PSI */ static int cgroup_file_open(struct kernfs_open_file *of) { @@ -3577,6 +3659,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3615,6 +3707,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3623,6 +3716,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -4651,18 +4745,27 @@ static struct cftype cgroup_base_files[] = { .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/configs.c b/kernel/configs.c index 2df132b20217..b062425ccf8d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -30,37 +30,35 @@ #include <linux/init.h> #include <linux/uaccess.h> -/**************************************************/ -/* the actual current config file */ - /* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * <image> - * IKCFG_ED + * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from + * a binary kernel image or a module. See scripts/extract-ikconfig. */ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) +asm ( +" .pushsection .rodata, \"a\" \n" +" .ascii \"IKCFG_ST\" \n" +" .global kernel_config_data \n" +"kernel_config_data: \n" +" .incbin \"kernel/config_data.gz\" \n" +" .global kernel_config_data_end \n" +"kernel_config_data_end: \n" +" .ascii \"IKCFG_ED\" \n" +" .popsection \n" +); #ifdef CONFIG_IKCONFIG_PROC +extern char kernel_config_data; +extern char kernel_config_data_end; + static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); + &kernel_config_data, + &kernel_config_data_end - + &kernel_config_data); } static const struct file_operations ikconfig_file_ops = { @@ -79,7 +77,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - proc_set_size(entry, kernel_config_data_size); + proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..2b750f13bc8f 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -158,7 +158,7 @@ out: bool dma_in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) diff --git a/kernel/fork.c b/kernel/fork.c index 08e1f9f22ddf..97d1a29babe2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -919,6 +919,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; + tsk->memcg_high_reclaim = NULL; #endif return tsk; diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 1e32e66c9563..2dddecbdbe6e 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) /* Duplicate gcov_info. */ active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); if (!dup) return NULL; dup->version = info->version; @@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), + iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), GFP_KERNEL); if (iter) iter->info = info; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 4a9191617076..0c11216171c9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -19,6 +19,7 @@ #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include <trace/events/sched.h> diff --git a/kernel/kcov.c b/kernel/kcov.c index c2277dbdbfb1..2ee38727844a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <linux/refcount.h> #include <asm/setup.h> /* Number of 64-bit words written per one comparison: */ @@ -44,7 +45,7 @@ struct kcov { * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) */ - atomic_t refcount; + refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; @@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch); static void kcov_get(struct kcov *kcov) { - atomic_inc(&kcov->refcount); + refcount_inc(&kcov->refcount); } static void kcov_put(struct kcov *kcov) { - if (atomic_dec_and_test(&kcov->refcount)) { + if (refcount_dec_and_test(&kcov->refcount)) { vfree(kcov->area); kfree(kcov); } @@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; - atomic_set(&kcov->refcount, 1); + refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); @@ -444,10 +445,8 @@ static int __init kcov_init(void) * there is no need to protect it against removal races. The * use of debugfs_create_file_unsafe() is actually safe here. */ - if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { - pr_err("failed to create kcov in debugfs\n"); - return -ENOMEM; - } + debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); + return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 65234c89d85b..103773a8e5e9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..0b9aa8ab89f0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig { if (!debug) return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, mod->name)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); -#endif + ddebug_add_module(debug, num, mod->name); } static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) diff --git a/kernel/panic.c b/kernel/panic.c index f121e6ba7e11..0ae0d7332f12 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, - NULL, - clear_warn_once_set, - "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, + "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0200, NULL, - NULL, &clear_warn_once_fops); + debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, + &clear_warn_once_fops); return 0; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..6f357f4fc859 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> +#include <linux/sched/signal.h> /* * Access another process' address space via ptrace. @@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7aafbeeda2f..22d7b09d3540 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8213ff6e365d..ea74d43924b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c3484785b179..0bb130692ad6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -127,11 +130,16 @@ #include "../workqueue_internal.h" #include <linux/sched/loadavg.h> #include <linux/seq_file.h> +#include <linux/eventfd.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +148,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -151,11 +159,16 @@ static int __init setup_psi(char *str) __setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ -#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define PSI_FREQ (2*HZ+1UL) /* 2 sec intervals */ #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; @@ -173,9 +186,18 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; + group->avg_next_update = sched_clock() + psi_period; + atomic_set(&group->polling, 0); INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + mutex_init(&group->update_lock); + /* Init trigger-related members */ + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->trigger_states = 0; + group->trigger_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; } void __init psi_init(void) @@ -210,20 +232,23 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +264,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; delta = times[s] - groupc->times_prev[s]; groupc->times_prev[s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +296,14 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +315,10 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -317,12 +343,19 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); - /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; - if (now - expires > psi_period) + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + + expires = group->avg_next_update; + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); /* @@ -332,14 +365,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +392,244 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, group->total[t->state], 0); + memcpy(group->polling_total, group->total, + sizeof(group->polling_total)); + group->polling_next_update = now + group->trigger_min_period; } +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == group->total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, group->total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) { + memcpy(group->polling_total, group->total, + sizeof(group->polling_total)); + } + + return now + group->trigger_min_period; +} + +/* + * psi_update_work represents slowpath accounting part while psi_group_change + * represents hotpath part. There are two potential races between them: + * 1. Changes to group->polling when slowpath checks for new stall, then hotpath + * records new stall and then slowpath resets group->polling flag. This leads + * to the exit from the polling mode while monitored state is still changing. + * 2. Slowpath overwriting an immediate update scheduled from the hotpath with + * a regular update further in the future and missing the immediate update. + * Both races are handled with a retry cycle in the slowpath: + * + * HOTPATH: | SLOWPATH: + * | + * A) times[cpu] += delta | E) delta = times[*] + * B) start_poll = (delta[poll_mask] &&| polling = g->polling + * cmpxchg(g->polling, 0, 1) == 0)| if delta[poll_mask]: + * if start_poll: | F) polling_until = now + grace_period + * C) mod_delayed_work(1) | if now > polling_until: + * else if !delayed_work_pending():| if polling: + * D) schedule_delayed_work(PSI_FREQ)| G) g->polling = polling = 0 + * | smp_mb + * | H) goto SLOWPATH + * | else: + * | if !polling: + * | I) g->polling = polling = 1 + * | J) if delta && first_pass: + * | next_avg = update_averages() + * | if polling: + * | next_poll = update_triggers() + * | if (delta && first_pass) || polling: + * | K) mod_delayed_work( + * | min(next_avg, next_poll)) + * | if !polling: + * | first_pass = false + * | L) goto SLOWPATH + * + * Race #1 is represented by (EABGD) sequence in which case slowpath deactivates + * polling mode because it misses new monitored stall and hotpath doesn't + * activate it because at (B) g->polling is not yet reset by slowpath in (G). + * This race is handled by the (H) retry, which in the race described above + * results in the new sequence of (EABGDHEIK) that reactivates polling mode. + * + * Race #2 is represented by polling==false && (JABCK) sequence which overwrites + * immediate update scheduled at (C) with a later (next_avg) update scheduled at + * (K). This race is handled by the (L) retry which results in the new sequence + * of polling==false && (JABCKLEIK) that reactivates polling mode and + * reschedules the next polling update (next_poll). + * + * Note that retries can't result in an infinite loop because retry #1 happens + * only during polling reactivation and retry #2 happens only on the first pass. + * Constant reactivations are impossible because polling will stay active for at + * least grace_period. Worst case scenario involves two retries (HEJKLE) + */ static void psi_update_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + bool first_pass = true; + u64 next_update; + u32 changed_states; + int polling; bool nonidle; + u64 now; dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, clock_work); + mutex_lock(&group->update_lock); + + now = sched_clock(); + +retry: + collect_percpu_times(group, &changed_states); + polling = atomic_read(&group->polling); + + if (changed_states & group->trigger_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. This prevents frequent changes to polling flag + * when system bounces in and out of stall states. + */ + group->polling_until = now + + group->trigger_min_period * UPDATES_PER_WINDOW; + } + + /* Handle polling flag transitions */ + if (now > group->polling_until) { + if (polling) { + group->polling_next_update = ULLONG_MAX; + polling = 0; + atomic_set(&group->polling, polling); + /* + * Memory barrier is needed to order group->polling=0 + * write before times[] reads in collect_percpu_times() + * to detect possible race with hotpath that modifies + * times[] before it sets group->polling=1 (see Race #1 + * description in the comments at the top). + */ + smp_mb(); + /* + * Check if we missed stall recorded by hotpath while + * polling flag was set to 1 causing hotpath to skip + * entering polling mode + */ + goto retry; + } + } else { + if (!polling) { + /* + * This can happen as a fixup in the retry cycle after + * new stall is discovered + */ + polling = 1; + atomic_set(&group->polling, polling); + } + } + /* + * At this point group->polling race with hotpath is resolved and + * we rely on local polling flag ignoring possible further changes + * to group->polling + */ + + nonidle = (changed_states & (1 << PSI_NONIDLE)); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +637,34 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ + if (nonidle && first_pass) { + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); - nonidle = update_stats(group); - - if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + if (now >= group->polling_next_update) { + group->polling_next_update = update_triggers( + group, now); + } } + if ((nonidle && first_pass) || polling) { + /* Calculate closest update time */ + next_update = min(group->polling_next_update, + group->avg_next_update); + mod_delayed_work(system_wq, dwork, nsecs_to_jiffies( + next_update - now) + 1); + if (!polling) { + /* + * We might have overwritten an immediate update + * scheduled from the hotpath with a longer regular + * update (group->avg_next_update). Execute second pass + * retry to discover that and resume polling. + */ + first_pass = false; + goto retry; + } + } + + mutex_unlock(&group->update_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +677,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +706,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, +static u32 psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +752,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -541,7 +822,27 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + /* + * Polling flag resets to 0 at the max rate of once per update + * window (at least 500ms interval). smp_wmb is required after + * group->polling 0-to-1 transition to order groupc->times and + * group->polling writes because stall detection logic in the + * slowpath relies on groupc->times changing before + * group->polling. Explicit smp_wmb is missing because cmpxchg() + * implies smp_mb. + */ + if ((state_mask & group->trigger_states) && + atomic_cmpxchg(&group->polling, 0, 1) == 0) { + /* + * Start polling immediately even if the work is already + * scheduled + */ + mod_delayed_work(system_wq, &group->clock_work, 1); + continue; + } + if (wake_clock && !delayed_work_pending(&group->clock_work)) schedule_delayed_work(&group->clock_work, PSI_FREQ); } @@ -642,6 +943,8 @@ void psi_cgroup_free(struct cgroup *cgroup) cancel_delayed_work_sync(&cgroup->psi.clock_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now by psi_trigger_destroy */ + WARN_ONCE(cgroup->psi.trigger_states, "psi: trigger leak\n"); } /** @@ -701,7 +1004,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->update_lock); + collect_percpu_times(group, NULL); + update_averages(group, sched_clock()); + mutex_unlock(&group->update_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -753,25 +1060,223 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + + mutex_lock(&group->update_lock); + + list_add(&t->node, &group->triggers); + group->trigger_min_period = min(group->trigger_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->trigger_states |= (1 << t->state); + + mutex_unlock(&group->update_lock); + + return t; +} + +void psi_trigger_destroy(struct psi_trigger *t) +{ + struct psi_group *group = t->group; + + if (static_branch_likely(&psi_disabled)) + return; + + mutex_lock(&group->update_lock); + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->trigger_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) { + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + } + group->trigger_min_period = period; + /* + * Wakeup waiters to stop polling. + * Can happen if cgroup is deleted from under + * a polling process. + */ + wake_up_interruptible(&t->event_wait); + kfree(t); + } + mutex_unlock(&group->update_lock); +} + +__poll_t psi_trigger_poll(struct psi_trigger *t, + struct file *file, poll_table *wait) +{ + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + return DEFAULT_POLLMASK | EPOLLPRI; + + /* Wait */ + return DEFAULT_POLLMASK; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *old; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + old = seq->private; + rcu_assign_pointer(seq->private, new); + mutex_unlock(&seq->lock); + + if (old) { + synchronize_rcu(); + psi_trigger_destroy(old); + } + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, + const char __user *user_buf, size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, + const char __user *user_buf, size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, + const char __user *user_buf, size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + struct psi_trigger *t; + __poll_t ret; + + rcu_read_lock(); + t = rcu_dereference_raw(seq->private); + if (t) + ret = psi_trigger_poll(t, file, wait); + else + ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + rcu_read_unlock(); + + return ret; + +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct psi_trigger *t = seq->private; + + if (static_branch_likely(&psi_disabled) || !t) + goto out; + + rcu_assign_pointer(seq->private, NULL); + synchronize_rcu(); + psi_trigger_destroy(t); +out: + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index b7953934aa99..3a9e41197d46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3003,7 +3003,7 @@ static bool known_siginfo_layout(unsigned sig, int si_code) if (si_code == SI_KERNEL) return true; else if ((si_code > SI_USER)) { - if (sig_specific_sicodes(sig)) { + if (sig && sig_specific_sicodes(sig)) { if (si_code <= sig_sicodes[sig].limit) return true; } diff --git a/kernel/sys.c b/kernel/sys.c index c5f875048aef..12df0e5434b8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; + /* fall through */ case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 987ae08147bf..4fce842c9bfb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,8 @@ #include <linux/bpf.h> #include <linux/mount.h> +#include "../lib/kstrtox.h" + #include <linux/uaccess.h> #include <asm/processor.h> @@ -127,6 +129,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -1457,7 +1460,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, @@ -1733,6 +1736,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2103,6 +2108,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v) } } +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. + */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer @@ -2146,7 +2186,8 @@ static int proc_get_long(char **buf, size_t *size, if (!isdigit(*p)) return -EINVAL; - *val = simple_strtoul(p, &p, 0); + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; len = p - tmp; @@ -2588,23 +2629,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { + int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } + *valp = tmp; } + return 0; } @@ -2653,22 +2696,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { + int ret; + unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? &tmp : valp; - if (write) { - unsigned int val = *lvalp; - - if (*lvalp > UINT_MAX) - return -EINVAL; + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -ERANGE; - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; + *valp = tmp; } return 0; @@ -2816,8 +2859,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8cecbe309e30..0c85fb6e2737 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -918,6 +918,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * CONTEXT: * spin_lock_irq(rq->lock) * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. + * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. |