Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                        4
-rw-r--r--  kernel/bpf/bpf_iter.c                 58
-rw-r--r--  kernel/bpf/core.c                      2
-rw-r--r--  kernel/bpf/map_iter.c                 37
-rw-r--r--  kernel/bpf/syscall.c                   2
-rw-r--r--  kernel/bpf/verifier.c                  2
-rw-r--r--  kernel/crash_core.c                   50
-rw-r--r--  kernel/dma/Kconfig                     8
-rw-r--r--  kernel/dma/debug.c                    55
-rw-r--r--  kernel/events/callchain.c              5
-rw-r--r--  kernel/events/core.c                   9
-rw-r--r--  kernel/events/uprobes.c                8
-rw-r--r--  kernel/exit.c                         18
-rw-r--r--  kernel/futex.c                         6
-rw-r--r--  kernel/kcov.c                          6
-rw-r--r--  kernel/kexec_file.c                   41
-rw-r--r--  kernel/kmod.c                          5
-rw-r--r--  kernel/kthread.c                       5
-rw-r--r--  kernel/module.c                       60
-rw-r--r--  kernel/panic.c                         4
-rw-r--r--  kernel/sched/core.c                   15
-rw-r--r--  kernel/sched/sched.h                   2
-rw-r--r--  kernel/signal.c                       16
-rw-r--r--  kernel/stacktrace.c                    5
-rw-r--r--  kernel/sys_ni.c                        1
-rw-r--r--  kernel/sysctl.c                        9
-rw-r--r--  kernel/sysctl_binary.c               171
-rw-r--r--  kernel/task_work.c                     8
-rw-r--r--  kernel/time/Kconfig                    9
-rw-r--r--  kernel/time/alarmtimer.c               2
-rw-r--r--  kernel/time/posix-cpu-timers.c       216
-rw-r--r--  kernel/time/sched_clock.c              2
-rw-r--r--  kernel/time/timekeeping.c              4
-rw-r--r--  kernel/time/timekeeping_internal.h    11
-rw-r--r--  kernel/time/timer.c                    1
-rw-r--r--  kernel/time/vsyscall.c                41
-rw-r--r--  kernel/trace/bpf_trace.c               2
-rw-r--r--  kernel/umh.c                          29
38 files changed, 522 insertions, 407 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5350fd292910..9a20016d4900 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,7 +5,7 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
@@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 363b9cafc2d8..b6715964b685 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link)
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
- if (iter_link->aux.map)
- bpf_map_put_with_uref(iter_link->aux.map);
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ union bpf_iter_link_info __user *ulinfo;
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
- struct bpf_iter_aux_info aux = {};
+ union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
- u32 prog_btf_id, target_fd;
+ u32 prog_btf_id, linfo_len;
bool existed = false;
- struct bpf_map *map;
int err;
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+ linfo_len = attr->link_create.iter_info_len;
+ if (!ulinfo ^ !linfo_len)
+ return -EINVAL;
+
+ if (ulinfo) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_user(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
@@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
if (!existed)
return -ENOENT;
- /* Make sure user supplied flags are target expected. */
- target_fd = attr->link_create.target_fd;
- if (attr->link_create.flags != tinfo->reg_info->req_linfo)
- return -EINVAL;
- if (!attr->link_create.flags && target_fd)
- return -EINVAL;
-
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
return err;
}
- if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) {
- map = bpf_map_get_with_uref(target_fd);
- if (IS_ERR(map)) {
- err = PTR_ERR(map);
- goto cleanup_link;
- }
-
- aux.map = map;
- err = tinfo->reg_info->check_target(prog, &aux);
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
if (err) {
- bpf_map_put_with_uref(map);
- goto cleanup_link;
+ bpf_link_cleanup(&link_primer);
+ return err;
}
-
- link->aux.map = map;
}
return bpf_link_settle(&link_primer);
-
-cleanup_link:
- bpf_link_cleanup(&link_primer);
- return err;
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
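For reference, the iterator attach parameters now arrive through a new UAPI union instead of link_create.target_fd. A rough sketch of the companion include/uapi/linux/bpf.h change (not part of this kernel/ diffstat, shown only for context):

    union bpf_iter_link_info {
            struct {
                    __u32   map_fd;
            } map;
    };

    /* and link_create grows a pointer/length pair for it: */
    __aligned_u64   iter_info;      /* extra bpf_iter_link_info */
    __u32           iter_info_len;  /* iter_info length */
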
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bde93344164d..ed0b3578867c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
* @index: the index of the program to replace
*
* Skips over dummy programs, by not counting them, when calculating
- * the the position of the program to replace.
+ * the position of the program to replace.
*
* Return:
* * 0 - Success
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index fbe1f557cb88..af86048e5afd 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = {
.seq_info = &bpf_map_seq_info,
};
-static int bpf_iter_check_map(struct bpf_prog *prog,
- struct bpf_iter_aux_info *aux)
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
{
u32 key_acc_size, value_acc_size, key_size, value_size;
- struct bpf_map *map = aux->map;
+ struct bpf_map *map;
bool is_percpu = false;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
else if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EINVAL;
+ goto put_map;
key_acc_size = prog->aux->max_rdonly_access;
value_acc_size = prog->aux->max_rdwr_access;
@@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
else
value_size = round_up(map->value_size, 8) * num_possible_cpus();
- if (key_acc_size > key_size || value_acc_size > value_size)
- return -EACCES;
+ if (key_acc_size > key_size || value_acc_size > value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+ aux->map = map;
return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
}
DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
@@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.target = "bpf_map_elem",
- .check_target = bpf_iter_check_map,
- .req_linfo = BPF_ITER_LINK_MAP_FD,
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
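From user space the map FD is now passed through iter_info rather than target_fd, and lands in bpf_iter_attach_map() above. A hedged libbpf-style sketch (the skeleton handle and map_fd are placeholders, not part of this patch):

    union bpf_iter_link_info linfo = {};
    DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
    struct bpf_link *link;

    linfo.map.map_fd = map_fd;
    opts.link_info = &linfo;
    opts.link_info_len = sizeof(linfo);

    link = bpf_program__attach_iter(skel->progs.dump_bpf_map_elem, &opts);
    if (libbpf_get_error(link))
            return -1;      /* -EBADF/-EINVAL/-EACCES propagated from the checks above */
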
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2f343ce15747..86299a292214 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
static int link_create(union bpf_attr *attr)
{
enum bpf_prog_type ptype;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b6ccfce3bf4c..ef938f17b944 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
/* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
+ * this stack slot, but current has STACK_MISC ->
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 18175687133a..106e4500fd53 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
#include <asm/page.h>
#include <asm/sections.h>
+#include <crypto/sha.h>
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+#define NOTES_SIZE (&__stop_notes - &__start_notes)
+#define BUILD_ID_MAX SHA1_DIGEST_SIZE
+#define NT_GNU_BUILD_ID 3
+
+struct elf_note_section {
+ struct elf_note n_hdr;
+ u8 n_data[];
+};
+
+/*
+ * Add build ID from .notes section as generated by the GNU ld(1)
+ * or LLVM lld(1) --build-id option.
+ */
+static void add_build_id_vmcoreinfo(void)
+{
+ char build_id[BUILD_ID_MAX * 2 + 1];
+ int n_remain = NOTES_SIZE;
+
+ while (n_remain >= sizeof(struct elf_note)) {
+ const struct elf_note_section *note_sec =
+ &__start_notes + NOTES_SIZE - n_remain;
+ const u32 n_namesz = note_sec->n_hdr.n_namesz;
+
+ if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
+ n_namesz != 0 &&
+ !strcmp((char *)&note_sec->n_data[0], "GNU")) {
+ if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
+ const u32 n_descsz = note_sec->n_hdr.n_descsz;
+ const u8 *s = &note_sec->n_data[n_namesz];
+
+ s = PTR_ALIGN(s, 4);
+ bin2hex(build_id, s, n_descsz);
+ build_id[2 * n_descsz] = '\0';
+ VMCOREINFO_BUILD_ID(build_id);
+ return;
+ }
+ pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
+ note_sec->n_hdr.n_descsz,
+ BUILD_ID_MAX);
+ return;
+ }
+ n_remain -= sizeof(struct elf_note) +
+ ALIGN(note_sec->n_hdr.n_namesz, 4) +
+ ALIGN(note_sec->n_hdr.n_descsz, 4);
+ }
+}
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ add_build_id_vmcoreinfo();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
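The parser added above walks standard ELF notes in the kernel's .notes section. For an ld/lld-generated SHA-1 build ID the record it is looking for has roughly this shape (sizes shown for illustration):

    struct {
            Elf32_Word n_namesz;    /* 4: "GNU" plus NUL              */
            Elf32_Word n_descsz;    /* 20: SHA1_DIGEST_SIZE           */
            Elf32_Word n_type;      /* 3: NT_GNU_BUILD_ID             */
            char       n_name[4];   /* "GNU\0"                        */
            u8         n_desc[20];  /* build ID bytes, 4-byte aligned */
    };

bin2hex() converts n_desc to the string that VMCOREINFO_BUILD_ID() records.
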
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index f4770fcfa62b..847a9d1fa634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+config NO_DMA
+ bool
+
config HAS_DMA
bool
depends on !NO_DMA
@@ -186,11 +189,6 @@ config DMA_API_DEBUG
drivers like double-freeing of DMA mappings or freeing mappings that
were never allocated.
- This also attempts to catch cases where a page owned by DMA is
- accessed by the cpu in a way that could cause data corruption. For
- example, this enables cow_user_page() to check that the source page is
- not undergoing DMA.
-
This option causes a performance degradation. Use only if you want to
debug device drivers and dma interactions.
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f7f807fb6ebb..8e9f7b301c6d 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
- *
- * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if any cachelines in the given page are in the active set.
*/
static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
static DEFINE_SPINLOCK(radix_lock);
@@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
overlap = active_cacheline_set_overlap(cln, ++overlap);
/* If we overflowed the overlap counter then we're potentially
- * leaking dma-mappings. Otherwise, if maps and unmaps are
- * balanced then this overflow may cause false negatives in
- * debug_dma_assert_idle() as the cacheline may be marked idle
- * prematurely.
+ * leaking dma-mappings.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
@@ -555,53 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&radix_lock, flags);
}
-/**
- * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_cacheline tree
- *
- * Place a call to this routine in cases where the cpu touching the page
- * before the dma completes (page is dma_unmapped) will lead to data
- * corruption.
- */
-void debug_dma_assert_idle(struct page *page)
-{
- static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
- struct dma_debug_entry *entry = NULL;
- void **results = (void **) &ents;
- unsigned int nents, i;
- unsigned long flags;
- phys_addr_t cln;
-
- if (dma_debug_disabled())
- return;
-
- if (!page)
- return;
-
- cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
- spin_lock_irqsave(&radix_lock, flags);
- nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
- CACHELINES_PER_PAGE);
- for (i = 0; i < nents; i++) {
- phys_addr_t ent_cln = to_cacheline_number(ents[i]);
-
- if (ent_cln == cln) {
- entry = ents[i];
- break;
- } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
- break;
- }
- spin_unlock_irqrestore(&radix_lock, flags);
-
- if (!entry)
- return;
-
- cln = to_cacheline_number(entry);
- err_printk(entry->dev, entry,
- "cpu touching an active dma mapped cacheline [cln=%pa]\n",
- &cln);
-}
-
/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c6ce894e4ce9..58cbe357fb2b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- set_fs(fs);
+ force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1f0a7e5b182..5bfe8e3c6e44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
+ force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -11707,7 +11706,7 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
/*
- * Reuse ptrace permission checks for now.
+ * Preserve ptrace permission check for backwards compatibility.
*
* We must hold exec_update_mutex across this and any potential
* perf_install_in_context() call for this new event to
@@ -11715,7 +11714,7 @@ SYSCALL_DEFINE5(perf_event_open,
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto err_cred;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 25de10c904e6..649fd53dc9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
if (!vaddr || !d)
return -EINVAL;
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, &vma, NULL);
if (unlikely(ret <= 0)) {
/*
@@ -477,7 +477,7 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index e731c414e024..733e80f334e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,7 +732,7 @@ void __noreturn do_exit(long code)
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
- set_fs(USER_DS);
+ force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
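kernel_wait() lets in-kernel callers reap a child without the __user casting that kernel_wait4() required. A minimal usage sketch, mirroring the umh.c conversion later in this diff (the thread function and argument are placeholders):

    int status;
    pid_t pid;

    pid = kernel_thread(worker_fn, arg, SIGCHLD);   /* hypothetical caller */
    if (pid >= 0)
            kernel_wait(pid, &status);  /* status written only when do_wait() reports success */
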
diff --git a/kernel/futex.c b/kernel/futex.c
index 83404124b77b..a5876694a60e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
mmap_read_lock(mm);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
mmap_read_unlock(mm);
@@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..6b8368be89c8 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -96,7 +96,7 @@ struct kcov_percpu_data {
int saved_sequence;
};
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
}
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
}
}
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 78c0837bfd7b..ca40bef75a61 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1169,24 +1169,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
int i, j;
- unsigned long long start, end;
+ unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
if (mstart > end || mend < start)
continue;
/* Truncate any area outside of range */
if (mstart < start)
- mstart = start;
+ p_start = start;
if (mend > end)
- mend = end;
+ p_end = end;
/* Found completely overlapping range */
- if (mstart == start && mend == end) {
+ if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1197,20 +1199,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+ /*
+ * Continue to check if there are another overlapping ranges
+ * from the current position because of shifting the above
+ * mem ranges.
+ */
+ i--;
+ mem->nr_ranges--;
+ continue;
}
mem->nr_ranges--;
return 0;
}
- if (mstart > start && mend < end) {
+ if (p_start > start && p_end < end) {
/* Split original range */
- mem->ranges[i].end = mstart - 1;
- temp_range.start = mend + 1;
+ mem->ranges[i].end = p_start - 1;
+ temp_range.start = p_end + 1;
temp_range.end = end;
- } else if (mstart != start)
- mem->ranges[i].end = mstart - 1;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
else
- mem->ranges[i].start = mend + 1;
+ mem->ranges[i].start = p_end + 1;
break;
}
@@ -1247,7 +1258,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
unsigned long long notes_addr;
unsigned long mstart, mend;
- /* extra phdr for vmcoreinfo elf note */
+ /* extra phdr for vmcoreinfo ELF note */
nr_phdr = nr_cpus + 1;
nr_phdr += mem->nr_ranges;
@@ -1255,7 +1266,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
* kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
* area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
* I think this is required by tools like gdb. So same physical
- * memory will be mapped in two elf headers. One will contain kernel
+ * memory will be mapped in two ELF headers. One will contain kernel
* text virtual addresses and other will have __va(physical) addresses.
*/
@@ -1282,7 +1293,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
- /* Prepare one phdr of type PT_NOTE for each present cpu */
+ /* Prepare one phdr of type PT_NOTE for each present CPU */
for_each_present_cpu(cpu) {
phdr->p_type = PT_NOTE;
notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
@@ -1324,10 +1335,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
- phdr++;
- pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+ phdr++;
}
*addr = buf;
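A concrete case the reworked loop now handles: with mem->ranges holding [0x1000-0x1fff] and [0x2000-0x2fff], excluding 0x1000-0x2fff used to clamp mend to 0x1fff in place, drop only the first entry and return, leaving the second entry behind. With p_start/p_end as per-iteration copies and the i--/continue after the shift, slot i is re-examined against the caller's original mstart/mend and the second range is removed as well.
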
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 37c3c4b97b8e..3cd075ce2a1e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,9 +36,8 @@
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b2807e7be772..3edaa380dc7b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
if (active_mm != mm)
mmdrop(active_mm);
- to_kthread(tsk)->oldfs = get_fs();
- set_fs(USER_DS);
+ to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- set_fs(to_kthread(tsk)->oldfs);
+ force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
sync_mm_rss(mm);
diff --git a/kernel/module.c b/kernel/module.c
index 8fa2600bde6a..1c5cff34d9f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
struct module *owner,
void *data),
void *data)
@@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
}
return false;
}
-EXPORT_SYMBOL_GPL(each_symbol_section);
struct find_symbol_arg {
/* Input */
@@ -496,6 +495,7 @@ struct find_symbol_arg {
struct module *owner;
const s32 *crc;
const struct kernel_symbol *sym;
+ enum mod_license license;
};
static bool check_exported_symbol(const struct symsearch *syms,
@@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms,
struct find_symbol_arg *fsa = data;
if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
+ if (syms->license == GPL_ONLY)
return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
pr_warn("Symbol %s is being used by a non-GPL module, "
"which will not be allowed in the future\n",
fsa->name);
@@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
+ fsa->license = syms->license;
return true;
}
@@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
+static const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const s32 **crc,
+ enum mod_license *license,
bool gplok,
bool warn)
{
@@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
+ if (license)
+ *license = fsa.license;
return fsa.sym;
}
pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
-EXPORT_SYMBOL_GPL(find_symbol);
/*
* Search for module by name: must hold module_mutex (or preempt disabled
@@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b)
}
/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
int err;
@@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b)
}
return 0;
}
-EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
+ if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod)
{
}
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(ref_module);
static inline int module_unload_init(struct module *mod)
{
@@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info,
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, true, false)) {
+ if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
preempt_enable();
BUG();
}
@@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+ mod->name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+ mod->name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
@@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
struct module *owner;
const struct kernel_symbol *sym;
const s32 *crc;
+ enum mod_license license;
int err;
/*
@@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
+ sym = find_symbol(name, &owner, &crc, &license,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!sym)
goto unlock;
+ if (license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, owner)) {
+ sym = NULL;
+ goto getname;
+ }
+
if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
@@ -2236,7 +2264,7 @@ void *__symbol_get(const char *symbol)
const struct kernel_symbol *sym;
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
+ sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
@@ -2272,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- true, false)) {
+ NULL, true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
@@ -4489,7 +4517,6 @@ struct module *__module_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_address);
/*
* is_module_text_address - is this address inside module code?
@@ -4528,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
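The practical effect of inherit_taint(): a module that links against a symbol exported by a proprietary module now either inherits TAINT_PROPRIETARY_MODULE itself (with the pr_warn above) or, if it has already resolved a GPL-only symbol, has the lookup refused (sym forced to NULL) so the mixed load fails with the pr_err message instead of proceeding silently.
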
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..aef8872ba843 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -505,7 +505,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -551,7 +551,7 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84758f34cdb0..8471a0f7eb32 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6431,10 +6431,10 @@ void sched_show_task(struct task_struct *p)
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6443,8 +6443,8 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
@@ -6479,13 +6479,6 @@ void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fd283892761..28709f6b0975 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
if (trace_sched_update_nr_running_tp_enabled()) {
- call_trace_sched_update_nr_running(rq, count);
+ call_trace_sched_update_nr_running(rq, -count);
}
/* Check if we still need preemption */
diff --git a/kernel/signal.c b/kernel/signal.c
index 6f16f7c5d375..42b67d2cea37 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig)
relock:
spin_lock_irq(&sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
+ /*
+ * Make sure we can safely read ->jobctl in task_work_add(). As Oleg
+ * states:
+ *
+ * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
+ * roughly have
+ *
+ * task_work_add: get_signal:
+ * STORE(task->task_works, new_work); STORE(task->jobctl);
+ * mb(); mb();
+ * LOAD(task->jobctl); LOAD(task->task_works);
+ *
+ * and we can rely on STORE-MB-LOAD [in task_work_add()].
+ */
+ smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
if (unlikely(current->task_works)) {
spin_unlock_irq(&sighand->siglock);
task_work_run();
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..946f44a9e86a 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
if (current->flags & PF_KTHREAD)
return 0;
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- set_fs(fs);
+ force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..4d59775ea79c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -364,7 +364,6 @@ COND_SYSCALL(socketcall);
COND_SYSCALL_COMPAT(socketcall);
/* compat syscalls for arm64, x86, ... */
-COND_SYSCALL_COMPAT(sysctl);
COND_SYSCALL_COMPAT(fanotify_mark);
/* x86 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f785de3caac0..287862f91717 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = sysctl_compaction_handler,
},
{
+ .procname = "compaction_proactiveness",
+ .data = &sysctl_compaction_proactiveness,
+ .maxlen = sizeof(sysctl_compaction_proactiveness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_hundred,
+ },
+ {
.procname = "extfrag_threshold",
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
deleted file mode 100644
index 7d550cc76a3b..000000000000
--- a/kernel/sysctl_binary.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/nsproxy.h>
-#include <linux/pid_namespace.h>
-#include <linux/file.h>
-#include <linux/ctype.h>
-#include <linux/netdevice.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <linux/slab.h>
-#include <linux/compat.h>
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-
-static void deprecated_sysctl_warning(const int *name, int nlen)
-{
- int i;
-
- /*
- * CTL_KERN/KERN_VERSION is used by older glibc and cannot
- * ever go away.
- */
- if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
- return;
-
- if (printk_ratelimit()) {
- printk(KERN_INFO
- "warning: process `%s' used the deprecated sysctl "
- "system call with ", current->comm);
- for (i = 0; i < nlen; i++)
- printk(KERN_CONT "%d.", name[i]);
- printk(KERN_CONT "\n");
- }
- return;
-}
-
-#define WARN_ONCE_HASH_BITS 8
-#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
-
-static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
-
-#define FNV32_OFFSET 2166136261U
-#define FNV32_PRIME 0x01000193
-
-/*
- * Print each legacy sysctl (approximately) only once.
- * To avoid making the tables non-const use a external
- * hash-table instead.
- * Worst case hash collision: 6, but very rarely.
- * NOTE! We don't use the SMP-safe bit tests. We simply
- * don't care enough.
- */
-static void warn_on_bintable(const int *name, int nlen)
-{
- int i;
- u32 hash = FNV32_OFFSET;
-
- for (i = 0; i < nlen; i++)
- hash = (hash ^ name[i]) * FNV32_PRIME;
- hash %= WARN_ONCE_HASH_SIZE;
- if (__test_and_set_bit(hash, warn_once_bitmap))
- return;
- deprecated_sysctl_warning(name, nlen);
-}
-
-static ssize_t do_sysctl(int __user *args_name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- int name[CTL_MAXNAME];
- int i;
-
- /* Check args->nlen. */
- if (nlen < 0 || nlen > CTL_MAXNAME)
- return -ENOTDIR;
- /* Read in the sysctl name for simplicity */
- for (i = 0; i < nlen; i++)
- if (get_user(name[i], args_name + i))
- return -EFAULT;
-
- warn_on_bintable(name, nlen);
-
- return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
- struct __sysctl_args tmp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
- tmp.newval, tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-
-#ifdef CONFIG_COMPAT
-
-struct compat_sysctl_args {
- compat_uptr_t name;
- int nlen;
- compat_uptr_t oldval;
- compat_uptr_t oldlenp;
- compat_uptr_t newval;
- compat_size_t newlen;
- compat_ulong_t __unused[4];
-};
-
-COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
-{
- struct compat_sysctl_args tmp;
- compat_size_t __user *compat_oldlenp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- compat_oldlenp = compat_ptr(tmp.oldlenp);
- if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
- compat_ptr(tmp.oldval), oldlen,
- compat_ptr(tmp.newval), tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-#endif /* CONFIG_COMPAT */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c0848ca1287..613b2d634af8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
set_notify_resume(task);
break;
case TWA_SIGNAL:
- if (lock_task_sighand(task, &flags)) {
+ /*
+ * Only grab the sighand lock if we don't already have some
+ * task_work pending. This pairs with the smp_store_mb()
+ * in get_signal(), see comment there.
+ */
+ if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+ lock_task_sighand(task, &flags)) {
task->jobctl |= JOBCTL_TASK_WORK;
signal_wake_up(task, 0);
unlock_task_sighand(task, &flags);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..a09b1d61df6a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
config GENERIC_CMOS_UPDATE
bool
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+ bool
+ default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2ffb466af77e..ca223a89530a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
* When a alarm timer fires, this runs through the timerqueue to
* see which alarms expired, and runs those. If there are more alarm
* timers queued for the future, we set the hrtimer to fire when
- * when the next future alarm timer expires.
+ * the next future alarm timer expires.
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..a71758e34e45 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
+ static struct lock_class_key posix_cpu_timers_key;
struct pid *pid;
rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return -EINVAL;
}
+ /*
+ * If posix timer expiry is handled in task work context then
+ * timer::it_lock can be taken without disabling interrupts as all
+ * other locking happens in task context. This requires a separate
+ * lock class key otherwise regular posix timer expiry would record
+ * the lock class being taken in interrupt context and generate a
+ * false positive warning.
+ */
+ if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+ lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
return false;
}
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+ handle_posix_cpu_timers(current);
+}
+
/*
- * This is called from the timer interrupt handler. The irq handler has
- * already updated our counts. We need to check if any timers fire now.
- * Interrupts are disabled.
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
*/
-void run_posix_cpu_timers(void)
+void __init posix_cputimers_init_work(void)
{
- struct task_struct *tsk = current;
- struct k_itimer *timer, *next;
- unsigned long flags;
- LIST_HEAD(firing);
+ init_task_work(&current->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+}
- lockdep_assert_irqs_disabled();
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside from that, the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return tsk->posix_cputimers_work.scheduled;
+}
- /*
- * The fast path checks that there are no expired thread or thread
- * group timers. If that's so, just return.
- */
- if (!fastpath_timer_check(tsk))
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
return;
- lockdep_posixtimer_enter();
- if (!lock_task_sighand(tsk, &flags)) {
- lockdep_posixtimer_exit();
- return;
+ /* Schedule task work to actually expire the timers */
+ tsk->posix_cputimers_work.scheduled = true;
+ task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ bool ret = true;
+
+ /*
+ * On !RT kernels interrupts are disabled while collecting expired
+ * timers, so no tick can happen and the fast path check can be
+ * reenabled without further checks.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ tsk->posix_cputimers_work.scheduled = false;
+ return true;
}
+
/*
- * Here we take off tsk->signal->cpu_timers[N] and
- * tsk->cpu_timers[N] all the timers that are firing, and
- * put them on the firing list.
+ * On RT enabled kernels ticks can happen while the expired timers
+ * are collected under sighand lock. But any tick which observes
+ * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
+ * checks. So reenabling the tick work has to be done carefully:
+ *
+ * Disable interrupts and run the fast path check if jiffies have
+ * advanced since the collecting of expired timers started. If
+ * jiffies have not advanced or the fast path check did not find
+ * newly expired timers, reenable the fast path check in the timer
+ * interrupt. If there are newly expired timers, return false and
+ * let the collection loop repeat.
*/
- check_thread_timers(tsk, &firing);
+ local_irq_disable();
+ if (start != jiffies && fastpath_timer_check(tsk))
+ ret = false;
+ else
+ tsk->posix_cputimers_work.scheduled = false;
+ local_irq_enable();
+
+ return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ lockdep_posixtimer_enter();
+ handle_posix_cpu_timers(tsk);
+ lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
+{
+ struct k_itimer *timer, *next;
+ unsigned long flags, start;
+ LIST_HEAD(firing);
+
+ if (!lock_task_sighand(tsk, &flags))
+ return;
- check_process_timers(tsk, &firing);
+ do {
+ /*
+ * On RT locking sighand lock does not disable interrupts,
+ * so this needs to be careful vs. ticks. Store the current
+ * jiffies value.
+ */
+ start = READ_ONCE(jiffies);
+ barrier();
+
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
+
+ /*
+ * The above timer checks have updated the expiry cache and
+ * because nothing can have queued or modified timers after
+ * sighand lock was taken above it is guaranteed to be
+ * consistent. So the next timer interrupt fastpath check
+ * will find valid data.
+ *
+ * If timer expiry runs in the timer interrupt context then
+ * the loop is not relevant as timers will be directly
+ * expired in interrupt context. The stub function below
+ * returns always true which allows the compiler to
+ * optimize the loop out.
+ *
+ * If timer expiry is deferred to task work context then
+ * the following rules apply:
+ *
+ * - On !RT kernels no tick can have happened on this CPU
+ * after sighand lock was acquired because interrupts are
+ * disabled. So reenabling task work before dropping
+ * sighand lock and reenabling interrupts is race free.
+ *
+ * - On RT kernels ticks might have happened but the tick
+ * work ignored posix CPU timer handling because the
+ * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+ * must be done very carefully including a check whether
+ * ticks have happened since the start of the timer
+ * expiry checks. posix_cpu_timers_enable_work() takes
+ * care of that and eventually lets the expiry checks
+ * run again.
+ */
+ } while (!posix_cpu_timers_enable_work(tsk, start));
/*
- * We must release these locks before taking any timer's lock.
+ * We must release sighand lock before taking any timer's lock.
* There is a potential race with timer deletion here, as the
* siglock now protects our private firing list. We have set
* the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void)
list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
int cpu_firing;
+ /*
+ * spin_lock() is sufficient here even independent of the
+ * expiry context. If expiry happens in hard interrupt
+ * context it's obvious. For task work context it's safe
+ * because all other operations on timer::it_lock happen in
+ * task context (syscall or exit).
+ */
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
@@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
- lockdep_posixtimer_exit();
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+ struct task_struct *tsk = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * If the actual expiry is deferred to task work context and the
+ * work is already scheduled there is no point to do anything here.
+ */
+ if (posix_cpu_timers_work_scheduled(tsk))
+ return;
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ __run_posix_cpu_timers(tsk);
}
/*
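The tsk->posix_cputimers_work member used throughout the new code comes from the companion include/linux/posix-timers.h change (outside this kernel/ diffstat); as a rough sketch it is:

    struct posix_cputimers_work {
            struct callback_head    work;
            unsigned int            scheduled;
    };

__run_posix_cpu_timers() queues .work via task_work_add() and run_posix_cpu_timers() polls .scheduled through posix_cpu_timers_work_scheduled().
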
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0deaf4b79fb4..1c03eec6ca9b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void)
{
/*
* If no sched_clock() function has been provided at that point,
- * make it the final one one.
+ * make it the final one.
*/
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 406306b33452..4c47f388a83f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -39,7 +39,7 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+DEFINE_RAW_SPINLOCK(timekeeper_lock);
/*
* The most important data for readout fits into a single 64 byte
@@ -2004,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
- * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * a shifted interval nanoseconds. Allows for O(log) accumulation
* loop.
*
* Returns the unconsumed cycles.
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index bcbb52db2256..4ca2787d1642 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TIMEKEEPING_INTERNAL_H
#define _TIMEKEEPING_INTERNAL_H
-/*
- * timekeeping debug functions
- */
+
#include <linux/clocksource.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
+/*
+ * timekeeping debug functions
+ */
#ifdef CONFIG_DEBUG_FS
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
#else
@@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
+/* Semi public for serialization of non timekeeper VDSO updates. */
+extern raw_spinlock_t timekeeper_lock;
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ae5029f984a8..a16764b0116e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
+ posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 54ce6eb2ca36..88e6b8ed6ca5 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -13,6 +13,8 @@
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>
+#include "timekeeping_internal.h"
+
static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
@@ -127,3 +129,42 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_data(vdata);
}
+
+/**
+ * vdso_update_begin - Start of a VDSO update section
+ *
+ * Allows architecture code to safely update the architecture specific VDSO
+ * data. Disables interrupts, acquires timekeeper lock to serialize against
+ * concurrent updates from timekeeping and invalidates the VDSO data
+ * sequence counter to prevent concurrent readers from accessing
+ * inconsistent data.
+ *
+ * Returns: Saved interrupt flags which need to be handed in to
+ * vdso_update_end().
+ */
+unsigned long vdso_update_begin(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ vdso_write_begin(vdata);
+ return flags;
+}
+
+/**
+ * vdso_update_end - End of a VDSO update section
+ * @flags: Interrupt flags as returned from vdso_update_begin()
+ *
+ * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
+ * synchronization if the architecture requires it, drops timekeeper lock
+ * and restores interrupt flags.
+ */
+void vdso_update_end(unsigned long flags)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ vdso_write_end(vdata);
+ __arch_sync_vdso_data(vdata);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
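A hedged sketch of the intended arch-side use of the new pair (the updated field is hypothetical; only the begin/end calls and the flags hand-off come from this patch):

    struct vdso_data *vdata = __arch_get_k_vdso_data();
    unsigned long flags;

    flags = vdso_update_begin();
    vdata->arch_data = new_value;   /* hypothetical arch-specific field */
    vdso_update_end(flags);
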
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cb91ef902cc4..a8d4f253ed77 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock);
#define BPF_TRACE_PRINTK_SIZE 1024
-static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
{
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
diff --git a/kernel/umh.c b/kernel/umh.c
index a25433f9cd9a..fcf3ee803630 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}