summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c28
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/bpf_local_storage.c8
-rw-r--r--kernel/bpf/bpf_lsm.c1
-rw-r--r--kernel/bpf/btf.c399
-rw-r--r--kernel/bpf/core.c28
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/cpumask.c479
-rw-r--r--kernel/bpf/devmap.c16
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c203
-rw-r--r--kernel/bpf/inode.c8
-rw-r--r--kernel/bpf/memalloc.c7
-rw-r--r--kernel/bpf/offload.c419
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c6
-rw-r--r--kernel/bpf/preload/iterators/Makefile12
-rw-r--r--kernel/bpf/preload/iterators/README5
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h419
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h (renamed from kernel/bpf/preload/iterators/iterators.lskel.h)0
-rw-r--r--kernel/bpf/ringbuf.c4
-rw-r--r--kernel/bpf/syscall.c110
-rw-r--r--kernel/bpf/verifier.c1318
-rw-r--r--kernel/capability.c114
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/cpuset.c63
-rw-r--r--kernel/cgroup/rstat.c4
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/context_tracking.c12
-rw-r--r--kernel/cpu_pm.c9
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/dma/swiotlb.c11
-rw-r--r--kernel/events/core.c255
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/fail_function.c5
-rw-r--r--kernel/fork.c35
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/irq/Kconfig5
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c405
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/ipi.c8
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/irq/irqdomain.c452
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq/msi.c70
-rw-r--r--kernel/kcov.c2
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c97
-rw-r--r--kernel/kexec_file.c11
-rw-r--r--kernel/kprobes.c27
-rw-r--r--kernel/ksysfs.c9
-rw-r--r--kernel/kthread.c5
-rw-r--r--kernel/livepatch/core.c72
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/locking/locktorture.c101
-rw-r--r--kernel/locking/qspinlock.c4
-rw-r--r--kernel/locking/rtmutex.c5
-rw-r--r--kernel/locking/rwsem.c87
-rw-r--r--kernel/module/kallsyms.c13
-rw-r--r--kernel/module/main.c29
-rw-r--r--kernel/notifier.c3
-rw-r--r--kernel/panic.c49
-rw-r--r--kernel/params.c3
-rw-r--r--kernel/pid_namespace.c22
-rw-r--r--kernel/pid_sysctl.h60
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c5
-rw-r--r--kernel/power/swap.c16
-rw-r--r--kernel/printk/index.c2
-rw-r--r--kernel/printk/internal.h45
-rw-r--r--kernel/printk/printk.c310
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Kconfig.debug15
-rw-r--r--kernel/rcu/rcu.h8
-rw-r--r--kernel/rcu/rcu_segcblist.c2
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcutorture.c12
-rw-r--r--kernel/rcu/refscale.c250
-rw-r--r--kernel/rcu/srcutree.c98
-rw-r--r--kernel/rcu/tasks.h85
-rw-r--r--kernel/rcu/tiny.c9
-rw-r--r--kernel/rcu/tree.c657
-rw-r--r--kernel/rcu/tree.h19
-rw-r--r--kernel/rcu/tree_exp.h43
-rw-r--r--kernel/rcu/tree_stall.h37
-rw-r--r--kernel/rcu/update.c49
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c14
-rw-r--r--kernel/rseq.c65
-rw-r--r--kernel/sched/clock.c27
-rw-r--r--kernel/sched/core.c158
-rw-r--r--kernel/sched/cpufreq_schedutil.c45
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/deadline.c42
-rw-r--r--kernel/sched/fair.c429
-rw-r--r--kernel/sched/idle.c47
-rw-r--r--kernel/sched/membarrier.c39
-rw-r--r--kernel/sched/psi.c7
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h107
-rw-r--r--kernel/sched/topology.c99
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c33
-rw-r--r--kernel/sysctl.c43
-rw-r--r--kernel/time/Kconfig6
-rw-r--r--kernel/time/alarmtimer.c33
-rw-r--r--kernel/time/clocksource.c72
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/posix-stubs.c2
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/test_udelay.c2
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c29
-rw-r--r--kernel/time/tick-broadcast.c6
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/Kconfig7
-rw-r--r--kernel/trace/blktrace.c10
-rw-r--r--kernel/trace/bpf_trace.c166
-rw-r--r--kernel/trace/ftrace.c111
-rw-r--r--kernel/trace/ring_buffer.c42
-rw-r--r--kernel/trace/trace.c10
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_eprobe.c95
-rw-r--r--kernel/trace/trace_events.c39
-rw-r--r--kernel/trace/trace_events_synth.c6
-rw-r--r--kernel/trace/trace_export.c3
-rw-r--r--kernel/trace/trace_kprobe.c72
-rw-r--r--kernel/trace/trace_preemptirq.c61
-rw-r--r--kernel/trace/trace_probe.c29
-rw-r--r--kernel/trace/trace_probe.h3
-rw-r--r--kernel/trace/trace_probe_kernel.h30
-rw-r--r--kernel/trace/trace_probe_tmpl.h48
-rw-r--r--kernel/trace/trace_uprobe.c13
-rw-r--r--kernel/umh.c63
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/watch_queue.c1
-rw-r--r--kernel/workqueue.c280
140 files changed, 6864 insertions, 3034 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 547c88be8a28..addeed3df15d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -1294,15 +1295,11 @@ out:
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
- int i;
-
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
@@ -2252,7 +2249,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -2807,7 +2804,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2877,10 +2874,21 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3a12e6b400a2..02242614dcc7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b39a46e8fb08..35f4138a54dc 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
@@ -580,8 +580,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att
raw_spin_lock_init(&smap->buckets[i].lock);
}
- smap->elem_size =
- sizeof(struct bpf_local_storage_elem) + attr->value_size;
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
return smap;
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index a4a41ee3e80b..e14c822f8911 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks)
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7dd8af06413..73780748404c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -337,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t)
#define BTF_SHOW_NAME_SIZE 80
/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
@@ -1397,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
@@ -1441,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -3228,7 +3247,7 @@ struct btf_field_info {
struct {
const char *node_name;
u32 value_btf_id;
- } list_head;
+ } graph_root;
};
};
@@ -3305,12 +3324,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
return NULL;
}
-static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
- const struct btf_type *t, int comp_idx,
- u32 off, int sz, struct btf_field_info *info)
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
{
+ const char *node_field_name;
const char *value_type;
- const char *list_node;
s32 id;
if (!__btf_type_is_struct(t))
@@ -3320,26 +3341,32 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
if (!value_type)
return -EINVAL;
- list_node = strstr(value_type, ":");
- if (!list_node)
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
return -EINVAL;
- value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN);
+ value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
if (!value_type)
return -ENOMEM;
id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
kfree(value_type);
if (id < 0)
return id;
- list_node++;
- if (str_is_empty(list_node))
+ node_field_name++;
+ if (str_is_empty(node_field_name))
return -EINVAL;
- info->type = BPF_LIST_HEAD;
+ info->type = head_type;
info->off = off;
- info->list_head.value_btf_id = id;
- info->list_head.node_name = list_node;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
return BTF_FIELD_FOUND;
}
+#define field_mask_test_name(field_type, field_type_str) \
+ if (field_mask & field_type && !strcmp(name, field_type_str)) { \
+ type = field_type; \
+ goto end; \
+ }
+
static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
int *align, int *sz)
{
@@ -3363,18 +3390,11 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
- if (field_mask & BPF_LIST_HEAD) {
- if (!strcmp(name, "bpf_list_head")) {
- type = BPF_LIST_HEAD;
- goto end;
- }
- }
- if (field_mask & BPF_LIST_NODE) {
- if (!strcmp(name, "bpf_list_node")) {
- type = BPF_LIST_NODE;
- goto end;
- }
- }
+ field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
+ field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
+ field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
+ field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & BPF_KPTR) {
type = BPF_KPTR_REF;
@@ -3387,6 +3407,8 @@ end:
return type;
}
+#undef field_mask_test_name
+
static int btf_find_struct_field(const struct btf *btf,
const struct btf_type *t, u32 field_mask,
struct btf_field_info *info, int info_cnt)
@@ -3419,6 +3441,7 @@ static int btf_find_struct_field(const struct btf *btf,
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_LIST_NODE:
+ case BPF_RB_NODE:
ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3432,8 +3455,11 @@ static int btf_find_struct_field(const struct btf *btf,
return ret;
break;
case BPF_LIST_HEAD:
- ret = btf_find_list_head(btf, t, member_type, i, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, t, member_type,
+ i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
if (ret < 0)
return ret;
break;
@@ -3480,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_LIST_NODE:
+ case BPF_RB_NODE:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3493,8 +3520,11 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
return ret;
break;
case BPF_LIST_HEAD:
- ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
if (ret < 0)
return ret;
break;
@@ -3596,21 +3626,25 @@ end_btf:
return ret;
}
-static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
- struct btf_field_info *info)
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
{
const struct btf_type *t, *n = NULL;
const struct btf_member *member;
u32 offset;
int i;
- t = btf_type_by_id(btf, info->list_head.value_btf_id);
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
/* We've already checked that value_btf_id is a struct type. We
* just need to figure out the offset of the list_node, and
* verify its type.
*/
for_each_member(i, t, member) {
- if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off)))
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
continue;
/* Invalid BTF, two members with same name */
if (n)
@@ -3618,24 +3652,38 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
n = btf_type_by_id(btf, member->type);
if (!__btf_type_is_struct(n))
return -EINVAL;
- if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
return -EINVAL;
offset = __btf_member_bit_offset(n, member);
if (offset % 8)
return -EINVAL;
offset /= 8;
- if (offset % __alignof__(struct bpf_list_node))
+ if (offset % node_type_align)
return -EINVAL;
- field->list_head.btf = (struct btf *)btf;
- field->list_head.value_btf_id = info->list_head.value_btf_id;
- field->list_head.node_offset = offset;
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
}
if (!n)
return -ENOENT;
return 0;
}
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
+}
+
struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
u32 field_mask, u32 value_size)
{
@@ -3698,7 +3746,13 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
if (ret < 0)
goto end;
break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
case BPF_LIST_NODE:
+ case BPF_RB_NODE:
break;
default:
ret = -EFAULT;
@@ -3707,8 +3761,33 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->cnt++;
}
- /* bpf_list_head requires bpf_spin_lock */
- if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
+ /* bpf_{list_head, rb_node} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* need collection identity for non-owning refs before allowing this
+ *
+ * Consider a node type w/ both list and rb_node fields:
+ * struct node {
+ * struct bpf_list_node l;
+ * struct bpf_rb_node r;
+ * }
+ *
+ * Used like so:
+ * struct node *n = bpf_obj_new(....);
+ * bpf_list_push_front(&list_head, &n->l);
+ * bpf_rbtree_remove(&rb_root, &n->r);
+ *
+ * It should not be possible to rbtree_remove the node since it hasn't
+ * been added to a tree. But push_front converts n to a non-owning
+ * reference, and rbtree_remove accepts the non-owning reference to
+ * a type w/ bpf_rb_node field.
+ */
+ if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
ret = -EINVAL;
goto end;
}
@@ -3719,62 +3798,76 @@ end:
return ERR_PTR(ret);
}
+#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
+#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
+
int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
{
int i;
- /* There are two owning types, kptr_ref and bpf_list_head. The former
- * only supports storing kernel types, which can never store references
- * to program allocated local types, atleast not yet. Hence we only need
- * to ensure that bpf_list_head ownership does not form cycles.
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
*/
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
u32 btf_id;
- if (!(rec->fields[i].type & BPF_LIST_HEAD))
+ if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
continue;
- btf_id = rec->fields[i].list_head.value_btf_id;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
meta = btf_find_struct_meta(btf, btf_id);
if (!meta)
return -EFAULT;
- rec->fields[i].list_head.value_rec = meta->record;
+ rec->fields[i].graph_root.value_rec = meta->record;
- if (!(rec->field_mask & BPF_LIST_NODE))
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & GRAPH_NODE_MASK))
continue;
/* We need to ensure ownership acyclicity among all types. The
* proper way to do it would be to topologically sort all BTF
* IDs based on the ownership edges, since there can be multiple
- * bpf_list_head in a type. Instead, we use the following
- * reasoning:
+ * bpf_{list_head,rb_node} in a type. Instead, we use the
+ * following resaoning:
*
* - A type can only be owned by another type in user BTF if it
- * has a bpf_list_node.
+ * has a bpf_{list,rb}_node. Let's call these node types.
* - A type can only _own_ another type in user BTF if it has a
- * bpf_list_head.
+ * bpf_{list_head,rb_root}. Let's call these root types.
*
- * We ensure that if a type has both bpf_list_head and
- * bpf_list_node, its element types cannot be owning types.
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
*
* To ensure acyclicity:
*
- * When A only has bpf_list_head, ownership chain can be:
+ * When A is an root type but not a node, its ownership
+ * chain can be:
* A -> B -> C
* Where:
- * - B has both bpf_list_head and bpf_list_node.
- * - C only has bpf_list_node.
+ * - A is an root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only an root, e.g. has bpf_list_node
*
- * When A has both bpf_list_head and bpf_list_node, some other
- * type already owns it in the BTF domain, hence it can not own
- * another owning type through any of the bpf_list_head edges.
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
* A -> B
* Where:
- * - B only has bpf_list_node.
+ * - A is both an root and node.
+ * - B is only an node.
*/
- if (meta->record->field_mask & BPF_LIST_HEAD)
+ if (meta->record->field_mask & GRAPH_ROOT_MASK)
return -ELOOP;
}
return 0;
@@ -4476,6 +4569,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
u16 i;
+ env->resolve_mode = RESOLVE_TBD;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
u32 var_type_id = vsi->type, type_id, type_size = 0;
const struct btf_type *var_type = btf_type_by_id(env->btf,
@@ -5236,6 +5330,8 @@ static const char *alloc_obj_fields[] = {
"bpf_spin_lock",
"bpf_list_head",
"bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
};
static struct btf_struct_metas *
@@ -5309,7 +5405,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
- record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size);
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5573,6 +5670,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
if (!ctx_struct)
/* should not happen */
return NULL;
+again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
if (!ctx_tname) {
/* should not happen */
@@ -5586,8 +5684,16 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, tname))
- return NULL;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_struct))
+ return NULL;
+ while (btf_type_is_modifier(ctx_struct))
+ ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+ goto again;
+ }
return ctx_type;
}
@@ -6433,6 +6539,18 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
return -EINVAL;
}
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (__btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
@@ -6453,6 +6571,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->arg_flags[i] = 0;
}
m->ret_size = 8;
+ m->ret_flags = 0;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
@@ -6472,6 +6591,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
@@ -6496,7 +6616,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
- m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
}
m->nr_args = nargs;
return 0;
@@ -7260,11 +7380,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
if (IS_ERR(btf)) {
- pr_warn("failed to validate module [%s] BTF: %ld\n",
- mod->name, PTR_ERR(btf));
kfree(btf_mod);
- if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
goto out;
}
err = btf_alloc_id(btf);
@@ -7782,9 +7905,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
- return 0;
end:
- btf_free_dtor_kfunc_tab(btf);
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
btf_put(btf);
return ret;
}
@@ -8210,3 +8333,119 @@ out:
}
return err;
}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member, *m_walk = NULL;
+ u32 i;
+ const char *walk_name;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+ if (ret < 0)
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, walk_type, member) {
+ u32 moff;
+
+ /* We're looking for the PTR_TO_BTF_ID member in the struct
+ * type we're walking which matches the specified offset.
+ * Below, we'll iterate over the fields in the safe variant of
+ * the struct and see if any of them has a matching type /
+ * name.
+ */
+ moff = __btf_member_bit_offset(walk_type, member) / 8;
+ if (off == moff) {
+ m_walk = member;
+ break;
+ }
+ }
+ if (m_walk == NULL)
+ return false;
+
+ walk_name = __btf_name_by_offset(btf, m_walk->name_off);
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+
+ /* If we match on both type and name, the field is considered trusted. */
+ if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* Either of the two names must be the other name, suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba3fff17e2f9..b297e9f60ca1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,7 +34,9 @@
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
+#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -87,7 +89,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@@ -96,12 +98,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -126,7 +128,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;
@@ -159,7 +161,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;
@@ -234,7 +236,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
u32 pages;
@@ -1910,9 +1912,7 @@ out:
* reuse preexisting logic from Spectre v1 mitigation that
* happens to produce the required code on x86 for v4 as well.
*/
-#ifdef CONFIG_X86
barrier_nospec();
-#endif
CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
if (fp->kprobe_override)
return false;
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e0b2d016f0bf..d2110c1f6fa6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -361,7 +361,7 @@ static int cpu_map_kthread_run(void *data)
/* Support running another XDP prog on this CPU */
nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+ m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..52b981512a35
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global kfuncs as their definitions will be in BTF");
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
+ * stored in a map.
+ * @cpumaskp: A pointer to a BPF cpumask map value.
+ *
+ * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
+ * cpumask returned by this function must either be embedded in a map as a
+ * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
+ * no BPF cpumask was found in the specified map value.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* The BPF memory allocator frees memory backing its caches in an RCU
+ * callback. Thus, we can safely use RCU to ensure that the cpumask is
+ * safe to read.
+ */
+ rcu_read_lock();
+
+ cpumask = READ_ONCE(*cpumaskp);
+ if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
+ cpumask = NULL;
+
+ rcu_read_unlock();
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!cpumask)
+ return;
+
+ if (refcount_dec_and_test(&cpumask->usage)) {
+ migrate_disable();
+ bpf_mem_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
+ }
+}
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
+{
+ return cpumask_any(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
+ * cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_any_and(src1, src2);
+}
+
+__diag_pop();
+
+BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_SET8_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d01e4c55b376..2675fefc6cb6 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -474,7 +474,11 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
{
int err;
- if (!dev->netdev_ops->ndo_xdp_xmit)
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return -EOPNOTSUPP;
err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
@@ -532,8 +536,14 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
- if (!obj ||
- !obj->dev->netdev_ops->ndo_xdp_xmit)
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return false;
+
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return false;
if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 66bded144377..5dfcb5ad0d06 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_value(&htab->map,
- l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
+ /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, value);
}
@@ -1792,6 +1791,7 @@ again_nocopy:
true);
else
copy_map_value(map, dst_val, value);
+ /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, dst_val);
}
if (do_delete) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index af30c6cbd65d..5b278a38ae58 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -756,19 +756,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
* arguments representation.
*/
-#define MAX_BPRINTF_BUF_LEN 512
+#define MAX_BPRINTF_BIN_ARGS 512
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL 3
struct bpf_bprintf_buffers {
- char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
+ char bin_args[MAX_BPRINTF_BIN_ARGS];
+ char buf[MAX_BPRINTF_BUF];
};
-static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
-static int try_get_fmt_tmp_buf(char **tmp_buf)
+static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
- struct bpf_bprintf_buffers *bufs;
int nest_level;
preempt_disable();
@@ -778,18 +779,19 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
preempt_enable();
return -EBUSY;
}
- bufs = this_cpu_ptr(&bpf_bprintf_bufs);
- *tmp_buf = bufs->tmp_bufs[nest_level - 1];
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
return 0;
}
-void bpf_bprintf_cleanup(void)
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
{
- if (this_cpu_read(bpf_bprintf_nest_level)) {
- this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
- }
+ if (!data->bin_args && !data->buf)
+ return;
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
}
/*
@@ -798,18 +800,20 @@ void bpf_bprintf_cleanup(void)
* Returns a negative value if fmt is an invalid format string or 0 otherwise.
*
* This can be used in two ways:
- * - Format string verification only: when bin_args is NULL
+ * - Format string verification only: when data->get_bin_args is false
* - Arguments preparation: in addition to the above verification, it writes in
- * bin_args a binary representation of arguments usable by bstr_printf where
- * pointers from BPF have been sanitized.
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
*
* In argument preparation mode, if 0 is returned, safe temporary buffers are
* allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
- u32 **bin_args, u32 num_args)
+ u32 num_args, struct bpf_bprintf_data *data)
{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
size_t sizeof_cur_arg, sizeof_cur_ip;
int err, i, num_spec = 0;
u64 cur_arg;
@@ -820,14 +824,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return -EINVAL;
fmt_size = fmt_end - fmt;
- if (bin_args) {
- if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
- return -EBUSY;
+ if (get_buffers && try_get_buffers(&buffers))
+ return -EBUSY;
- tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
- *bin_args = (u32 *)tmp_buf;
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
}
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
for (i = 0; i < fmt_size; i++) {
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
@@ -1021,31 +1030,33 @@ nocopy_fmt:
err = 0;
out:
if (err)
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(data);
return err;
}
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
* can safely give an unbounded size.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
if (err < 0)
return err;
- err = bstr_printf(str, str_size, fmt, bin_args);
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return err + 1;
}
@@ -1745,12 +1756,12 @@ unlock:
while (head != orig_head) {
void *obj = head;
- obj -= field->list_head.node_offset;
+ obj -= field->graph_root.node_offset;
head = head->next;
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- bpf_obj_free_fields(field->list_head.value_rec, obj);
+ bpf_obj_free_fields(field->graph_root.value_rec, obj);
/* bpf_mem_free requires migrate_disable(), since we can be
* called from map free path as well apart from BPF program (as
* part of map ops doing bpf_obj_free_fields).
@@ -1761,11 +1772,51 @@ unlock:
}
}
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct rb_root_cached orig_root, *root = rb_root;
+ struct rb_node *pos, *n;
+ void *obj;
+
+ BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+ BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+ __bpf_spin_lock_irqsave(spin_lock);
+ orig_root = *root;
+ *root = RB_ROOT_CACHED;
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ obj = pos;
+ obj -= field->graph_root.node_offset;
+
+ bpf_obj_free_fields(field->graph_root.value_rec, obj);
+
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, obj);
+ migrate_enable();
+ }
+}
+
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in vmlinux BTF");
-void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
{
struct btf_struct_meta *meta = meta__ign;
u64 size = local_type_id__k;
@@ -1779,7 +1830,7 @@ void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return p;
}
-void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
@@ -1800,12 +1851,12 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea
tail ? list_add_tail(n, h) : list_add(n, h);
}
-void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
{
return __bpf_list_add(node, head, false);
}
-void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
{
return __bpf_list_add(node, head, true);
}
@@ -1823,23 +1874,73 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
return (struct bpf_list_node *)n;
}
-struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
{
return __bpf_list_del(head, false);
}
-struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
{
return __bpf_list_del(head, true);
}
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+ struct rb_node *n = (struct rb_node *)node;
+
+ rb_erase_cached(n, r);
+ RB_CLEAR_NODE(n);
+ return (struct bpf_rb_node *)n;
+}
+
+/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
+ * program
+ */
+static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ void *less)
+{
+ struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ bpf_callback_t cb = (bpf_callback_t)less;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node((struct rb_node *)node, parent, link);
+ rb_insert_color_cached((struct rb_node *)node,
+ (struct rb_root_cached *)root, leftmost);
+}
+
+__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+{
+ __bpf_rbtree_add(root, node, (void *)less);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)rb_first_cached(r);
+}
+
/**
* bpf_task_acquire - Acquire a reference to a task. A task acquired by this
* kfunc which is not stored in a map as a kptr, must be released by calling
* bpf_task_release().
* @p: The task on which a reference is being acquired.
*/
-struct task_struct *bpf_task_acquire(struct task_struct *p)
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
{
return get_task_struct(p);
}
@@ -1850,7 +1951,7 @@ struct task_struct *bpf_task_acquire(struct task_struct *p)
* released by calling bpf_task_release().
* @p: The task on which a reference is being acquired.
*/
-struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
{
/* For the time being this function returns NULL, as it's not currently
* possible to safely acquire a reference to a task with RCU protection
@@ -1902,7 +2003,7 @@ struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
* be released by calling bpf_task_release().
* @pp: A pointer to a task kptr on which a reference is being acquired.
*/
-struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
+__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
{
/* We must return NULL here until we have clarity on how to properly
* leverage RCU for ensuring a task's lifetime. See the comment above
@@ -1915,7 +2016,7 @@ struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
* bpf_task_release - Release the reference acquired on a task.
* @p: The task on which a reference is being released.
*/
-void bpf_task_release(struct task_struct *p)
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
{
if (!p)
return;
@@ -1930,7 +2031,7 @@ void bpf_task_release(struct task_struct *p)
* calling bpf_cgroup_release().
* @cgrp: The cgroup on which a reference is being acquired.
*/
-struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
cgroup_get(cgrp);
return cgrp;
@@ -1942,7 +2043,7 @@ struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
* be released by calling bpf_cgroup_release().
* @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
*/
-struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
+__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
{
struct cgroup *cgrp;
@@ -1974,7 +2075,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
* drops to 0.
* @cgrp: The cgroup on which a reference is being released.
*/
-void bpf_cgroup_release(struct cgroup *cgrp)
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
{
if (!cgrp)
return;
@@ -1989,7 +2090,7 @@ void bpf_cgroup_release(struct cgroup *cgrp)
* @cgrp: The cgroup for which we're performing a lookup.
* @level: The level of ancestor to look up.
*/
-struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
{
struct cgroup *ancestor;
@@ -2008,7 +2109,7 @@ struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
* stored in a map, or released with bpf_task_release().
* @pid: The pid of the task being looked up.
*/
-struct task_struct *bpf_task_from_pid(s32 pid)
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
{
struct task_struct *p;
@@ -2021,22 +2122,22 @@ struct task_struct *bpf_task_from_pid(s32 pid)
return p;
}
-void *bpf_cast_to_kern_ctx(void *obj)
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
}
-void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
{
return obj__ign;
}
-void bpf_rcu_read_lock(void)
+__bpf_kfunc void bpf_rcu_read_lock(void)
{
rcu_read_lock();
}
-void bpf_rcu_read_unlock(void)
+__bpf_kfunc void bpf_rcu_read_unlock(void)
{
rcu_read_unlock();
}
@@ -2057,6 +2158,10 @@ BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+
#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4f841e16779e..9948b542a470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index ebcc3dd0fa19..5fcdacbb8439 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size)
if (size <= 192)
return size_index[(size - 1) / 8] - 1;
- return fls(size - 1) - 1;
+ return fls(size - 1) - 2;
}
#define NUM_CACHES 11
@@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
return obj;
}
- return kmalloc_node(c->unit_size, flags, node);
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
@@ -395,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
unit_size = size;
#ifdef CONFIG_MEMCG_KMEM
- objcg = get_obj_cgroup_from_current();
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
#endif
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 190d9f9dc987..0c85e06f7ea7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
if (!ondev) {
- err = -EINVAL;
- goto err_unlock;
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,24 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -340,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -405,15 +549,6 @@ err_unlock:
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -573,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -592,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
- up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -625,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -670,18 +765,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -707,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+ if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
+ p = ops->xmo_rx_timestamp;
+ else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
+ p = ops->xmo_rx_hash;
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+late_initcall(bpf_offload_init);
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 5106b5372f0c..b56f9f3314fd 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -3,7 +3,11 @@
#include <linux/init.h>
#include <linux/module.h>
#include "bpf_preload.h"
-#include "iterators/iterators.lskel.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
static struct bpf_link *maps_link, *progs_link;
static struct iterators_bpf *skel;
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 6762b1260f2f..8937dc6bc8d0 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,20 +35,22 @@ endif
.PHONY: all clean
-all: iterators.lskel.h
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton -L $< > $@
-
-$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
- $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
index 7fd6d39a9ad2..98e7c90ea012 100644
--- a/kernel/bpf/preload/iterators/README
+++ b/kernel/bpf/preload/iterators/README
@@ -1,4 +1,7 @@
WARNING:
-If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h".
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
Make sure to have clang 10 installed.
See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..ebdc6c0cdb70
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6008;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\
+\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\
+\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\
+\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\
+\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\
+\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\
+\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\
+\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\
+\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\
+\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\
+\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\
+\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\
+\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\
+\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\
+\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\
+\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\
+\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\
+\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\
+\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\
+\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\
+\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\
+\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\
+\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\
+\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\
+\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\
+\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\
+\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\
+\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\
+\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\
+\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\
+\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\
+\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\
+\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\
+\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\
+\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\
+\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\
+\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\
+\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\
+\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\
+\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\
+\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\
+\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\
+\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\
+\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\
+\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\
+\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\
+\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\
+\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\
+\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\
+\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\
+\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\
+\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\
+\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\
+\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\
+\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\
+\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\
+\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\
+\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\
+\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\
+\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\
+\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\
+\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\
+\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\
+\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\
+\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\
+\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\
+\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\
+\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\
+\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\
+\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\
+\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\
+\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\
+\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\
+\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\
+\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\
+\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\
+\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\
+\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\
+\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\
+\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\
+\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\
+\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\
+\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\
+\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\
+\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\
+\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\
+\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\
+\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\
+\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\
+\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\
+\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\
+\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\
+\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\
+\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\
+\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\
+\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\
+\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\
+\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\
+\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\
+\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\
+\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\
+\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\
+\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\
+\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\
+\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\
+\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\
+\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\
+\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\
+\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\
+\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\
+\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\
+\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\
+\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\
+\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\
+\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\
+\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\
+\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\
+\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\
+\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\
+\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\
+\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\
+\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\
+\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\
+\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\
+\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\
+\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\
+\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\
+\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
index 70f236a82fe1..70f236a82fe1 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 80f4b4d88aaf..8732e0aadf36 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
} else {
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
*/
return -EPERM;
} else {
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ecca9366c7a6..adc83cb82f37 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
@@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@@ -390,7 +390,7 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map)
{
unsigned long flags;
@@ -402,18 +402,12 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
if (!map->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&map_idr_lock, flags);
- else
- __acquire(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
idr_remove(&map_idr, map->id);
map->id = 0;
- if (do_idr_lock)
- spin_unlock_irqrestore(&map_idr_lock, flags);
- else
- __release(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -424,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map)
* So we have to check map->objcg for being NULL each time it's
* being used.
*/
- map->objcg = get_obj_cgroup_from_current();
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
@@ -470,6 +465,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
return ptr;
}
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
@@ -527,9 +537,6 @@ void btf_record_free(struct btf_record *rec)
return;
for (i = 0; i < rec->cnt; i++) {
switch (rec->fields[i].type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
if (rec->fields[i].kptr.module)
@@ -538,7 +545,11 @@ void btf_record_free(struct btf_record *rec)
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
- /* Nothing to release for bpf_list_head */
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ /* Nothing to release */
break;
default:
WARN_ON_ONCE(1);
@@ -571,9 +582,6 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
new_rec->cnt = 0;
for (i = 0; i < rec->cnt; i++) {
switch (fields[i].type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
btf_get(fields[i].kptr.btf);
@@ -584,7 +592,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
- /* Nothing to acquire for bpf_list_head */
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ /* Nothing to acquire */
break;
default:
ret = -EFAULT;
@@ -664,7 +676,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
continue;
bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
case BPF_LIST_NODE:
+ case BPF_RB_NODE:
break;
default:
WARN_ON_ONCE(1);
@@ -706,13 +724,13 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
-static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_put(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
- bpf_map_free_id(map, do_idr_lock);
+ bpf_map_free_id(map);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
/* Avoid spawning kworkers, since they all might contend
@@ -721,11 +739,6 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
queue_work(system_unbound_wq, &map->work);
}
}
-
-void bpf_map_put(struct bpf_map *map)
-{
- __bpf_map_put(map, true);
-}
EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
@@ -882,10 +895,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
- vma->vm_flags &= ~VM_MAYEXEC;
+ vm_flags_clear(vma, VM_MAYEXEC);
if (!(vma->vm_flags & VM_WRITE))
/* disallow re-mapping with PROT_WRITE */
- vma->vm_flags &= ~VM_MAYWRITE;
+ vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
if (err)
@@ -1005,7 +1018,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return -EINVAL;
map->record = btf_parse_fields(btf, value_type,
- BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1053,6 +1067,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
break;
case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1483,7 +1498,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1547,7 +1562,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1605,7 +1620,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1851,7 +1866,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1944,7 +1959,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -2245,7 +2260,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -2481,7 +2496,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
- BPF_F_XDP_HAS_FRAGS))
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2565,7 +2581,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -2589,7 +2605,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
goto free_prog_sec;
}
@@ -3987,7 +4010,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -4215,7 +4238,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
@@ -5309,7 +5332,6 @@ static struct ctl_table bpf_syscall_table[] = {
{
.procname = "bpf_stats_enabled",
.data = &bpf_stats_enabled_key.key,
- .maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
.proc_handler = bpf_stats_handler,
},
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbef0b0967ae..272563a0b770 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -190,6 +190,10 @@ struct bpf_verifier_stack_elem {
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -255,6 +259,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
int map_uid;
int func_id;
struct btf *btf;
@@ -456,6 +461,11 @@ static bool type_is_ptr_alloc_obj(u32 type)
return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}
+static bool type_is_non_owning_ref(u32 type)
+{
+ return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
struct btf_record *rec = NULL;
@@ -638,31 +648,57 @@ static void print_liveness(struct bpf_verifier_env *env,
verbose(env, "D");
}
-static int get_spi(s32 off)
+static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
}
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
- int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
- /* We need to check that slots between [spi - nr_slots + 1, spi] are
- * within [0, allocated_stack).
- *
- * Please note that the spi grows downwards. For example, a dynptr
- * takes the size of two stack slots; the first slot will be at
- * spi and the second slot will be at spi - 1.
- */
- return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *cur = env->cur_state;
+ int off, spi;
- return cur->frame[reg->frameno];
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "dynptr has to be at a constant offset\n");
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi < 1) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS))
+ return -ERANGE;
+ return spi;
}
static const char *kernel_type_name(const struct btf* btf, u32 id)
@@ -727,37 +763,58 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
enum bpf_dynptr_type type,
- bool first_slot);
+ bool first_slot, int dynptr_id);
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
struct bpf_reg_state *sreg2,
enum bpf_dynptr_type type)
{
- __mark_dynptr_reg(sreg1, type, true);
- __mark_dynptr_reg(sreg2, type, false);
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
}
-static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
enum bpf_dynptr_type type)
{
- __mark_dynptr_reg(reg, type, true);
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
}
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type type;
- int spi, i, id;
-
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ int spi, i, id, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_DYNPTR;
@@ -768,7 +825,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
&state->stack[spi - 1].spilled_ptr, type);
if (dynptr_type_refcounted(type)) {
@@ -781,6 +838,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
return 0;
}
@@ -789,10 +849,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
struct bpf_func_state *state = func(env, reg);
int spi, i;
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
@@ -805,43 +864,133 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
+ *
+ * While we don't allow reading STACK_INVALID, it is still possible to
+ * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
+ * helpers or insns can do partial read of that part without failing,
+ * but check_stack_range_initialized, check_stack_read_var_off, and
+ * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
+ * the slot conservatively. Hence we need to prevent those liveness
+ * marking walks.
+ *
+ * This was not a problem before because STACK_INVALID is only set by
+ * default (where the default reg state has its reg->parent as NULL), or
+ * in clean_live_states after REG_LIVE_DONE (at which point
+ * mark_reg_read won't walk reg->parent chain), but not randomly during
+ * verifier state exploration (like we did above). Hence, for our case
+ * parentage chain will still be live (i.e. reg->parent may be
+ * non-NULL), while earlier reg->parent was NULL, so we need
+ * REG_LIVE_WRITTEN to screen off read marker propagation when it is
+ * done later on reads or by mark_dynptr_read as well to unnecessary
+ * mark registers in verifier state.
+ */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *state = func(env, reg);
- int spi, i;
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
- if (reg->type == CONST_PTR_TO_DYNPTR)
- return false;
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return true;
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
- state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
- return false;
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id) {
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, dreg);
+ else
+ __mark_reg_unknown(env, dreg);
+ }
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Same reason as unmark_stack_slots_dynptr above */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
+{
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ if (spi < 0)
+ return spi == -ERANGE;
+ /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
+ * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
+ * ensure dynptr objects at the slots we are touching are completely
+ * destructed before we reinitialize them for a new one. For referenced
+ * ones, destroy_if_dynptr_stack_slot returns an error early instead of
+ * delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
{
struct bpf_func_state *state = func(env, reg);
- int spi;
int i;
/* This already represents first slot of initialized bpf_dynptr */
if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.dynptr.first_slot)
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
return false;
for (i = 0; i < BPF_REG_SIZE; i++) {
@@ -868,7 +1017,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
if (reg->type == CONST_PTR_TO_DYNPTR) {
return reg->dynptr.type == dynptr_type;
} else {
- spi = get_spi(reg->off);
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
}
}
@@ -931,6 +1082,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose_a("id=%d", reg->id);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
if (t != SCALAR_VALUE)
verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
@@ -1404,9 +1557,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
*/
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
+ /* Clear off and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
___mark_reg_known(reg, imm);
}
@@ -1447,7 +1602,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
- bool first_slot)
+ bool first_slot, int dynptr_id)
{
/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
* callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
@@ -1455,6 +1610,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty
*/
__mark_reg_known_zero(reg);
reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
reg->dynptr.type = type;
reg->dynptr.first_slot = first_slot;
}
@@ -1486,6 +1643,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
reg->type &= ~PTR_MAYBE_NULL;
}
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
+}
+
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
@@ -1752,11 +1919,13 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
/*
- * Clear type, id, off, and union(map_ptr, range) and
+ * Clear type, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
reg->precise = !env->bpf_capable;
@@ -2185,6 +2354,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
desc->imm = call_imm;
@@ -2386,6 +2561,32 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ret;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
+ if (ret)
+ return ret;
+ return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
+ state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -3243,13 +3444,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
static void save_register_state(struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
@@ -3261,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state,
scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
}
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -3272,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
+ u32 dst_reg = insn->dst_reg;
err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
if (err)
@@ -3307,6 +3525,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -3322,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
}
save_register_state(state, spi, reg, size);
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ insn->imm != 0 && env->bpf_capable) {
+ struct bpf_reg_state fake_reg = {};
+
+ __mark_reg_known(&fake_reg, (u32)insn->imm);
+ fake_reg.type = SCALAR_VALUE;
+ save_register_state(state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -3356,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (reg && register_is_null(reg)) {
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
/* backtracking doesn't work for STACK_ZERO yet. */
err = mark_chain_precision(env, value_regno);
if (err)
@@ -3401,6 +3631,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
int min_off, max_off;
int i, err;
struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
bool writing_zero = false;
/* set if the fact that we're writing a zero is used to let any
* stack slots remain STACK_ZERO
@@ -3413,13 +3644,22 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if (value_reg && register_is_null(value_reg))
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
if (err)
return err;
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
@@ -3577,7 +3817,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
for (i = 0; i < size; i++) {
@@ -3598,7 +3838,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -4759,6 +4999,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields)
+
+BTF_TYPE_SAFE_NESTED(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+};
+
+static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ int off)
+{
+ /* If its parent is not trusted, it can't regain its trusted status. */
+ if (!is_trusted_reg(reg))
+ return false;
+
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
+
+ return btf_nested_type_is_trusted(&env->log, reg, off);
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -4830,7 +5089,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (type_is_alloc(reg->type) && !reg->ref_obj_id) {
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !reg->ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
return -EFAULT;
}
@@ -4847,10 +5107,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_flag(reg->type) & PTR_UNTRUSTED)
flag |= PTR_UNTRUSTED;
- /* By default any pointer obtained from walking a trusted pointer is
- * no longer trusted except the rcu case below.
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust.
+ *
+ * An RCU-protected pointer can also be deemed trusted if we are in an
+ * RCU read region. This case is handled below.
*/
- flag &= ~PTR_TRUSTED;
+ if (nested_ptr_is_trusted(env, reg, off))
+ flag |= PTR_TRUSTED;
+ else
+ flag &= ~PTR_TRUSTED;
if (flag & MEM_RCU) {
/* Mark value register as MEM_RCU only if it is protected by
@@ -5447,6 +5714,31 @@ static int check_stack_range_initialized(
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibilitiy of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
@@ -5788,9 +6080,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
cur->active_lock.ptr = btf;
cur->active_lock.id = reg->id;
} else {
- struct bpf_func_state *fstate = cur_func(env);
void *ptr;
- int i;
if (map)
ptr = map;
@@ -5806,25 +6096,11 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_lock.ptr = NULL;
- cur->active_lock.id = 0;
- for (i = fstate->acquired_refs - 1; i >= 0; i--) {
- int err;
+ invalidate_non_owning_refs(env);
- /* Complain on error because this reference state cannot
- * be freed before this point, as bpf_spin_lock critical
- * section does not allow functions that release the
- * allocated object immediately.
- */
- if (!fstate->refs[i].release_on_unlock)
- continue;
- err = release_reference(env, fstate->refs[i].id);
- if (err) {
- verbose(env, "failed to release release_on_unlock reference");
- return err;
- }
- }
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
}
return 0;
}
@@ -5934,6 +6210,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int spi = 0;
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -5944,12 +6221,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
}
/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
* check_func_arg_reg_off's logic. We only need to check offset
- * alignment for PTR_TO_STACK.
+ * and its alignment for PTR_TO_STACK.
*/
- if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
- return -EINVAL;
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 && spi != -ERANGE)
+ return spi;
}
+
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
*
@@ -5966,7 +6245,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
* to.
*/
if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
return -EINVAL;
}
@@ -5981,13 +6260,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
meta->uninit_dynptr_regno = regno;
} else /* MEM_RDONLY and None case from above */ {
+ int err;
+
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
return -EINVAL;
}
- if (!is_dynptr_reg_valid_init(env, reg)) {
+ if (!is_dynptr_reg_valid_init(env, reg, spi)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
regno);
@@ -6014,6 +6295,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
err_extra, regno);
return -EINVAL;
}
+
+ err = mark_dynptr_read(env, reg);
+ if (err)
+ return err;
}
return 0;
}
@@ -6283,6 +6568,23 @@ found:
return 0;
}
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
int check_func_arg_reg_off(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno,
enum bpf_arg_type arg_type)
@@ -6304,6 +6606,18 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
*/
if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
return 0;
+
+ if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
+ if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
+ return __check_ptr_off_reg(env, reg, regno, true);
+
+ verbose(env, "R%d must have zero offset when passed to release func\n",
+ regno);
+ verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
+ kernel_type_name(reg->btf, reg->btf_id), reg->off);
+ return -EINVAL;
+ }
+
/* Doing check_ptr_off_reg check for the offset will catch this
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
@@ -6338,6 +6652,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
* can be non-zero. This was already checked above. So pass
@@ -6351,15 +6666,29 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
}
}
-static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
- return reg->ref_obj_id;
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
- spi = get_spi(reg->off);
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
return state->stack[spi].spilled_ptr.ref_obj_id;
}
@@ -6433,9 +6762,8 @@ skip_type_check:
* PTR_TO_STACK.
*/
if (reg->type == PTR_TO_STACK) {
- spi = get_spi(reg->off);
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.ref_obj_id) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
verbose(env, "arg %d is an unacquired reference\n", regno);
return -EINVAL;
}
@@ -6536,6 +6864,10 @@ skip_type_check:
meta->ret_btf_id = reg->btf_id;
break;
case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
if (meta->func_id == BPF_FUNC_spin_lock) {
err = process_spin_lock(env, regno, true);
if (err)
@@ -7087,6 +7419,17 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ __mark_reg_unknown(env, reg);
+ }));
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -7108,6 +7451,8 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
+static bool is_callback_calling_kfunc(u32 btf_id);
+
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
@@ -7162,10 +7507,18 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* interested in validating only BPF helpers that can call subprogs as
* callbacks
*/
- if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) {
- verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
+ if (set_callee_state_cb != set_callee_state) {
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -7417,7 +7770,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
* callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
- mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
@@ -7430,6 +7783,63 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+ *
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+ * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+ * by this point, so look at 'root'
+ */
+ struct btf_field *field;
+
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ BPF_RB_ROOT);
+ if (!field || !field->graph_root.value_btf_id)
+ return -EFAULT;
+
+ mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+ mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Are we currently verifying the callback for a rbtree helper that must
+ * be called with lock held? If so, no need to complain about unreleased
+ * lock
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_func_state *callee;
+ int kfunc_btf_id;
+
+ if (!state->curframe)
+ return false;
+
+ callee = state->frame[state->curframe];
+
+ if (!callee->in_callback_fn)
+ return false;
+
+ kfunc_btf_id = insn[callee->callsite].imm;
+ return is_rbtree_lock_required_kfunc(kfunc_btf_id);
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -7622,6 +8032,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
int err, fmt_map_off, num_args;
u64 fmt_addr;
char *fmt;
@@ -7646,7 +8057,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
* can focus on validating the format specifiers.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
if (err < 0)
verbose(env, "Invalid format string\n");
@@ -7922,13 +8333,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
if (arg_type_is_dynptr(fn->arg_type[i])) {
struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+ int id, ref_obj_id;
+
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
if (meta.ref_obj_id) {
verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
return -EFAULT;
}
- meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
break;
}
}
@@ -8084,6 +8514,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
@@ -8175,6 +8608,7 @@ struct bpf_kfunc_call_arg_meta {
bool r0_rdonly;
u32 ret_btf_id;
u64 r0_size;
+ u32 subprogno;
struct {
u64 value;
bool found;
@@ -8186,6 +8620,9 @@ struct bpf_kfunc_call_arg_meta {
struct {
struct btf_field *field;
} arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
};
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
@@ -8297,12 +8734,16 @@ enum {
KF_ARG_DYNPTR_ID,
KF_ARG_LIST_HEAD_ID,
KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
BTF_ID(struct, bpf_dynptr_kern)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -8336,6 +8777,28 @@ static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
}
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+ const struct btf_param *arg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
+ if (!t)
+ return false;
+
+ return true;
+}
+
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
const struct btf *btf,
@@ -8395,6 +8858,9 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
KF_ARG_PTR_TO_MEM,
KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
};
enum special_kfunc_type {
@@ -8408,6 +8874,9 @@ enum special_kfunc_type {
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add,
+ KF_bpf_rbtree_first,
};
BTF_SET_START(special_kfunc_set)
@@ -8419,6 +8888,9 @@ BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_first)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -8432,6 +8904,9 @@ BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_first)
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -8493,6 +8968,12 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_LIST_NODE;
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -8502,6 +8983,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
return KF_ARG_PTR_TO_BTF_ID;
}
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
arg_mem_size = true;
@@ -8540,9 +9024,37 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
- if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id))
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if (is_kfunc_acquire(meta) ||
+ (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
+ WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
@@ -8588,38 +9100,54 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
return 0;
}
-static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id)
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
+
+ if (!state->active_lock.ptr) {
+ verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+ return -EFAULT;
+ }
+
+ if (type_flag(reg->type) & NON_OWN_REF) {
+ verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+ return -EFAULT;
+ }
+
+ reg->type |= NON_OWN_REF;
+ return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state, *unused;
struct bpf_reg_state *reg;
int i;
- /* bpf_spin_lock only allows calling list_push and list_pop, no BPF
- * subprogs, no global functions. This means that the references would
- * not be released inside the critical section but they may be added to
- * the reference state, and the acquired_refs are never copied out for a
- * different frame as BPF to BPF calls don't work in bpf_spin_lock
- * critical sections.
- */
+ state = cur_func(env);
+
if (!ref_obj_id) {
- verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n");
+ verbose(env, "verifier internal error: ref_obj_id is zero for "
+ "owning -> non-owning conversion\n");
return -EFAULT;
}
+
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].id == ref_obj_id) {
- if (state->refs[i].release_on_unlock) {
- verbose(env, "verifier internal error: expected false release_on_unlock");
- return -EFAULT;
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
}
- state->refs[i].release_on_unlock = true;
- /* Now mark everyone sharing same ref_obj_id as untrusted */
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id)
- reg->type |= PTR_UNTRUSTED;
- }));
- return 0;
- }
+ }));
+ return 0;
}
+
verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
return -EFAULT;
}
@@ -8705,101 +9233,226 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_list_pop_back];
}
-static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, u32 regno,
- struct bpf_kfunc_call_arg_meta *meta)
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+}
+
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+ enum btf_field_type head_field_type,
+ u32 kfunc_btf_id)
{
+ bool ret;
+
+ switch (head_field_type) {
+ case BPF_LIST_HEAD:
+ ret = is_bpf_list_api_kfunc(kfunc_btf_id);
+ break;
+ case BPF_RB_ROOT:
+ ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+ btf_field_type_name(head_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+ btf_field_type_name(head_field_type));
+ return ret;
+}
+
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+ enum btf_field_type node_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (node_field_type) {
+ case BPF_LIST_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]);
+ break;
+ case BPF_RB_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+ btf_field_type_name(node_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+ btf_field_type_name(node_field_type));
+ return ret;
+}
+
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
struct btf_field *field;
struct btf_record *rec;
- u32 list_head_off;
+ u32 head_off;
- if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) {
- verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n");
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
return -EFAULT;
}
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
if (!tnum_is_const(reg->var_off)) {
verbose(env,
- "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
return -EINVAL;
}
rec = reg_btf_record(reg);
- list_head_off = reg->off + reg->var_off.value;
- field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD);
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
if (!field) {
- verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off);
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
return -EINVAL;
}
/* All functions require bpf_list_head to be protected using a bpf_spin_lock */
if (check_reg_allocation_locked(env, reg)) {
- verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n",
- rec->spin_lock_off);
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
return -EINVAL;
}
- if (meta->arg_list_head.field) {
- verbose(env, "verifier internal error: repeating bpf_list_head arg\n");
+ if (*head_field) {
+ verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
return -EFAULT;
}
- meta->arg_list_head.field = field;
+ *head_field = field;
return 0;
}
-static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
struct bpf_kfunc_call_arg_meta *meta)
{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
const struct btf_type *et, *t;
struct btf_field *field;
- struct btf_record *rec;
- u32 list_node_off;
+ u32 node_off;
- if (meta->btf != btf_vmlinux ||
- (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] &&
- meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) {
- verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n");
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
return -EFAULT;
}
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
if (!tnum_is_const(reg->var_off)) {
verbose(env,
- "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
return -EINVAL;
}
- rec = reg_btf_record(reg);
- list_node_off = reg->off + reg->var_off.value;
- field = btf_record_find(rec, list_node_off, BPF_LIST_NODE);
- if (!field || field->offset != list_node_off) {
- verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off);
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field || field->offset != node_off) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
return -EINVAL;
}
- field = meta->arg_list_head.field;
+ field = *node_field;
- et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id);
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
t = btf_type_by_id(reg->btf, reg->btf_id);
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf,
- field->list_head.value_btf_id, true)) {
- verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d "
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
"in struct %s, but arg is at offset=%d in struct %s\n",
- field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off),
- list_node_off, btf_name_by_offset(reg->btf, t->name_off));
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
return -EINVAL;
}
- if (list_node_off != field->list_head.node_offset) {
- verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n",
- list_node_off, field->list_head.node_offset,
- btf_name_by_offset(field->list_head.btf, et->name_off));
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
return -EINVAL;
}
- /* Set arg#1 for expiration after unlock */
- return ref_set_release_on_unlock(env, reg->ref_obj_id);
+
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
}
static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
@@ -8885,6 +9538,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
+ if (is_kfunc_trusted_args(meta) &&
+ (register_is_null(reg) || type_may_be_null(reg->type))) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
if (reg->ref_obj_id) {
if (is_kfunc_release(meta) && meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -8930,8 +9589,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
case KF_ARG_PTR_TO_MEM:
case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
/* Trusted by default */
break;
default:
@@ -9008,6 +9670,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_LIST_NODE:
if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to allocated object\n", i);
@@ -9021,6 +9697,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+ if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
+ verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ return -EINVAL;
+ }
+ } else {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_BTF_ID:
/* Only base_type is checked, further checks are done here */
if ((base_type(reg->type) != PTR_TO_BTF_ID ||
@@ -9056,6 +9757,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
/* Skip next '__sz' argument */
i++;
break;
+ case KF_ARG_PTR_TO_CALLBACK:
+ meta->subprogno = reg->subprogno;
+ break;
}
}
@@ -9072,11 +9776,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
+ u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
bool sleepable, rcu_lock, rcu_unlock;
struct bpf_kfunc_call_arg_meta meta;
- u32 i, nargs, func_id, ptr_type_id;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
const struct btf_type *ret_t;
@@ -9171,6 +9875,35 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, func_id);
+ return err;
+ }
+ }
+
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -9235,11 +9968,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = field->list_head.btf;
- regs[BPF_REG_0].btf_id = field->list_head.value_btf_id;
- regs[BPF_REG_0].off = field->list_head.node_offset;
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ struct btf_field *field = meta.arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
} else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
@@ -9305,7 +10039,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (is_kfunc_ret_null(&meta))
regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
}
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove])
+ invalidate_non_owning_refs(env);
+
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
@@ -9592,7 +10332,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
@@ -10845,7 +11585,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -10856,7 +11596,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
@@ -11491,8 +12231,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
*/
if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
return;
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off))
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
return;
+
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -11655,7 +12397,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ copy_register_state(reg, known_reg);
}));
}
@@ -12958,6 +13700,13 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
{
unsigned int i;
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
if (!idmap[i].old) {
/* Reached an empty slot; haven't seen this id before */
@@ -13069,79 +13818,74 @@ next:
}
}
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_id_pair *idmap)
+{
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
- bool equal;
-
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
-
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
if (rcur->type == NOT_INIT)
return false;
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (equal)
+ if (regs_exact(rold, rcur, idmap))
return true;
if (env->explore_alu_limits)
return false;
- if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise)
- return true;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- } else {
- /* We're trying to use a pointer in place of a scalar.
- * Even if the scalar was unbounded, this could lead to
- * pointer leaks because scalars are allowed to leak
- * while pointers are not. We could make this safe in
- * special cases if root is calling us, but it's
- * probably not worth the hassle.
- */
- return false;
- }
+ if (!rold->precise)
+ return true;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (type_may_be_null(rold->type)) {
- if (!type_may_be_null(rcur->type))
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
- }
-
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * 'id' is not compared, since it's only used for maps with
- * bpf_spin_lock inside map element and in such cases if
- * the rest of the prog is valid for one map element then
- * it's valid for all map elements regardless of the key
- * used in bpf_map_lookup()
*/
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off) &&
check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != rold->type)
- return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
* still safe. This is true even if old range < old off,
@@ -13156,7 +13900,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (rold->off != rcur->off)
return false;
/* id relations must be preserved */
- if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ if (!check_ids(rold->id, rcur->id, idmap))
return false;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
@@ -13165,15 +13909,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
/* two stack pointers are equal only if they're pointing to
* the same stack frame, since fp-8 in foo != fp-8 in bar
*/
- return equal && rold->frameno == rcur->frameno;
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
default:
- /* Only valid matches are exact, which memcmp() */
- return equal;
+ return regs_exact(rold, rcur, idmap);
}
-
- /* Shouldn't get here; if we do, say it's not safe */
- WARN_ON_ONCE(1);
- return false;
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
@@ -13220,10 +13959,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (!is_spilled_reg(&old->stack[spi]))
- continue;
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -13234,17 +13972,48 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ {
+ const struct bpf_reg_state *old_reg, *cur_reg;
+
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ }
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+ struct bpf_id_pair *idmap)
{
+ int i;
+
if (old->acquired_refs != cur->acquired_refs)
return false;
- return !memcmp(old->refs, cur->refs,
- sizeof(*old->refs) * old->acquired_refs);
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ return false;
+ }
+
+ return true;
}
/* compare two verifier states
@@ -13286,7 +14055,7 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
- if (!refsafe(old, cur))
+ if (!refsafe(old, cur, env->idmap_scratch))
return false;
return true;
@@ -13823,7 +14592,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_log_len = env->log.len_used;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -13974,7 +14743,7 @@ static int do_check(struct bpf_verifier_env *env)
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
(insn->src_reg == BPF_PSEUDO_CALL) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
- (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) {
+ (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
verbose(env, "function calls are not allowed while holding a lock\n");
return -EINVAL;
}
@@ -14010,7 +14779,8 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_lock.ptr) {
+ if (env->cur_state->active_lock.ptr &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
@@ -14272,9 +15042,10 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_list_head yet\n");
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
return -EINVAL;
}
}
@@ -14303,7 +15074,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -14784,7 +15555,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -14865,7 +15636,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -15052,7 +15823,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -15452,7 +16223,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -15496,12 +16267,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
+ void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
+ *cnt = 0;
+
+ if (bpf_dev_bound_kfunc_id(insn->imm)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
+ if (xdp_kfunc) {
+ insn->imm = BPF_CALL_IMM(xdp_kfunc);
+ return 0;
+ }
+
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
/* insn->imm has the btf func_id. Replace it with
* an address (relative to __bpf_call_base).
*/
@@ -15512,7 +16296,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- *cnt = 0;
insn->imm = desc->imm;
if (insn->off)
return 0;
@@ -16438,7 +17221,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ int err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -16519,6 +17302,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
@@ -16740,6 +17529,24 @@ BTF_ID(func, rcu_read_unlock_strict)
#endif
BTF_SET_END(btf_id_deny)
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -16758,9 +17565,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
+ if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -16939,7 +17745,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -16952,7 +17758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -16987,7 +17793,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
diff --git a/kernel/capability.c b/kernel/capability.c
index 860fd22117c1..3e058f41df32 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
@@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
@@ -486,11 +472,11 @@ EXPORT_SYMBOL(file_ns_capable);
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) &&
- vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
@@ -502,13 +488,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c099cf3fa02d..935e8121b21e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a29c0b13706b..636f1c682ac0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void)
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
@@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)
if (top_cs && (task->flags & PF_KTHREAD) &&
kthread_is_per_cpu(task))
continue;
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+
+ cpumask_and(new_cpus, cs->effective_cpus,
+ task_cpu_possible_mask(task));
+ set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
}
@@ -1267,7 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
- * @cpuset: The cpuset that requests change in partition root state
+ * @cs: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
@@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
partition_is_populated(parent, cs))
return PERR_NOCPUS;
@@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmp->new_cpus);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1661,7 +1665,7 @@ update_parent_subparts:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
}
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmpmask.new_cpus);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2324,6 +2328,7 @@ out:
new_prs = -new_prs;
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
/*
* Update child cpusets, if present.
@@ -3281,8 +3286,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -3347,7 +3350,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
* as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
@@ -3384,7 +3387,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3691,15 +3694,38 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ rcu_read_lock();
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
+ rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3879,8 +3905,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3904,12 +3929,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
-
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3919,6 +3946,9 @@ int cpuset_mem_spread_node(void)
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
+/**
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ */
int cpuset_slab_spread_node(void)
{
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
@@ -3927,7 +3957,6 @@ int cpuset_slab_spread_node(void)
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
-
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 793ecff29038..831f1f472bb8 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -26,7 +26,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* rstat_cpu->updated_children list. See the comment on top of
* cgroup_rstat_cpu definition for details.
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
@@ -231,7 +231,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
*
* This function may block.
*/
-void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
diff --git a/kernel/compat.c b/kernel/compat.c
index 55551989d9da..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 77978e372377..a09f1c19336a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Even if context tracking is disabled on this CPU, because it's outside
@@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_add(state, &ct->state);
+ arch_atomic_add(state, &ct->state);
}
}
}
@@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_sub(state, &ct->state);
+ arch_atomic_sub(state, &ct->state);
}
}
}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index ba4ba71facf9..b0f0d15085db 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- /*
- * This introduces a RCU read critical section, which could be
- * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
- * this.
- */
- ct_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- ct_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 87ef6096823f..755f5f08ab38 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(folio, _folio_dtor);
+ VMCOREINFO_OFFSET(folio, _folio_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a34c38bbe28f..03e3251cd9d2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -156,14 +156,6 @@ setup_io_tlb_npages(char *str)
}
early_param("swiotlb", setup_io_tlb_npages);
-unsigned int swiotlb_max_segment(void)
-{
- if (!io_tlb_default_mem.nslabs)
- return 0;
- return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
-}
-EXPORT_SYMBOL_GPL(swiotlb_max_segment);
-
unsigned long swiotlb_size_or_default(void)
{
return default_nslabs << IO_TLB_SHIFT;
@@ -300,7 +292,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
-static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags,
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d56328e5080e..fb3e436bcd4a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2163,7 +2163,7 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
- if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
add_event_to_groups(sibling, event->ctx);
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
@@ -3872,7 +3872,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
if (likely(!ctx->nr_events))
return;
- if (is_active ^ EVENT_TIME) {
+ if (!(is_active & EVENT_TIME)) {
/* start ctx time */
__update_context_time(ctx, false);
perf_cgroup_set_timestamp(cpuctx);
@@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
-
+ raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
atomic_set(&epc->refcount, 1);
epc->embedded = 1;
- raw_spin_lock_irq(&ctx->lock);
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
- raw_spin_unlock_irq(&ctx->lock);
} else {
WARN_ON_ONCE(epc->ctx != ctx);
atomic_inc(&epc->refcount);
}
-
+ raw_spin_unlock_irq(&ctx->lock);
return epc;
}
@@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head)
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
+ struct perf_event_context *ctx = epc->ctx;
unsigned long flags;
- if (!atomic_dec_and_test(&epc->refcount))
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
return;
- if (epc->ctx) {
- struct perf_event_context *ctx = epc->ctx;
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- /*
- * XXX
- *
- * lockdep_assert_held(&ctx->mutex);
- *
- * can't because of the call-site in _free_event()/put_event()
- * which isn't always called under ctx->mutex.
- */
-
- WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
- raw_spin_lock_irqsave(&ctx->lock, flags);
- list_del_init(&epc->pmu_ctx_entry);
- epc->ctx = NULL;
- raw_spin_unlock_irqrestore(&ctx->lock, flags);
- }
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
WARN_ON_ONCE(!list_empty(&epc->pinned_active));
WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
if (epc->embedded)
return;
@@ -6573,7 +6568,7 @@ aux_unlock:
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
@@ -7046,13 +7041,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
+
+static void __perf_event_header__init_id(struct perf_sample_data *data,
struct perf_event *event,
u64 sample_type)
{
data->type = event->attr.sample_type;
- header->size += event->id_header_size;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -7079,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event, event->attr.sample_type);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -7310,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
size_t size;
size = data->br_stack->nr
@@ -7554,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
-
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
-
/*
- * Clear the sample flags that have already been done by the
- * PMU driver.
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
*/
- filtered_sample_type = sample_type & ~data->sample_flags;
- __perf_event_header__init_id(header, data, event, filtered_sample_type);
-
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
-
- if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
- data->callchain = perf_callchain(event, regs);
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- size += data->callchain->nr;
-
- header->size += size * sizeof(u64);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- data->raw = NULL;
- }
-
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
- if (branch_sample_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7639,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7650,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7664,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
- if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
- if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
- if (filtered_sample_type & PERF_SAMPLE_ADDR)
- data->addr = 0;
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7693,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
- filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7715,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_AUX) {
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7732,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7767,7 +7785,8 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
err = output_begin(&handle, data, event, header.size);
if (err)
@@ -9168,7 +9187,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, data, event,
+ ret = perf_output_begin(&handle, &sample, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -9399,6 +9418,7 @@ void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
perf_output_end(&handle);
}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
@@ -10125,8 +10145,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
};
perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
- data.sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
@@ -10333,13 +10352,7 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_lock();
prog = READ_ONCE(event->prog);
if (prog) {
- if (prog->call_get_stack &&
- (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
- !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
- data->callchain = perf_callchain(event, regs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
-
+ perf_prepare_sample(data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
rcu_read_unlock();
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d9e357b7e17c..59887c69d54c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -22,7 +22,6 @@
#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
@@ -161,7 +160,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
int err;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
@@ -1352,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
diff --git a/kernel/exit.c b/kernel/exit.c
index 15dc2ec80c46..f2afdb0add7c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -807,6 +807,8 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
+ WARN_ON(irqs_disabled());
+
synchronize_group_exit(tsk, code);
WARN_ON(tsk->plug);
@@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr)
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
@@ -1898,7 +1905,14 @@ bool thread_group_exited(struct pid *pid)
}
EXPORT_SYMBOL(thread_group_exited);
-__weak void abort(void)
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
{
BUG();
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index a7ccd2930c5f..d971a0189319 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)
static void fei_debugfs_remove_attr(struct fei_attr *attr)
{
- struct dentry *dir;
-
- dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
- debugfs_remove_recursive(dir);
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
}
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..d8cda4c6de6c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
* orig->shared.rb may be modified concurrently, but the clone
* will be reinitialized.
*/
- *new = data_race(*orig);
+ data_race(memcpy(new, orig, sizeof(*new)));
INIT_LIST_HEAD(&new->anon_vma_chain);
dup_anon_vma_name(orig, new);
}
@@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(old_vmi, oldmm, 0);
+ VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
khugepaged_fork(mm, oldmm);
- retval = mas_expected_entries(&mas, oldmm->map_count);
+ retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
if (retval)
goto out;
- mas_for_each(&old_mas, mpnt, ULONG_MAX) {
+ for_each_vma(old_vmi, mpnt) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
@@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
hugetlb_dup_vma_private(tmp);
/* Link the vma into the MT */
- mas.index = tmp->vm_start;
- mas.last = tmp->vm_end - 1;
- mas_store(&mas, tmp);
- if (mas_is_err(&mas))
- goto fail_nomem_mas_store;
+ if (vma_iter_bulk_store(&vmi, tmp))
+ goto fail_nomem_vmi_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -712,7 +709,7 @@ fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
-fail_nomem_mas_store:
+fail_nomem_vmi_store:
unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
@@ -1044,7 +1041,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
@@ -1060,6 +1057,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->reported_split_lock = 0;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->mm_cid_active = 0;
+#endif
return tsk;
free_stack:
@@ -1169,6 +1170,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
+ mm_init_cid(mm);
return mm;
fail_pcpu:
@@ -1601,6 +1603,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
@@ -2933,7 +2936,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
* - make the CLONE_DETACHED bit reusable for clone3
* - make the CSIGNAL bits reusable for clone3
*/
- if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
@@ -3034,7 +3037,7 @@ void __init mm_cache_init(void)
* dynamically sized based on the maximum CPU number this system
* can have, taking hotplug into account (nr_cpu_ids).
*/
- mm_size = sizeof(struct mm_struct) + cpumask_size();
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 81b97f0f6556..1ef9a87511f5 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -7,7 +7,7 @@ set -e
sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/$tarfile.tmp
+cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir
dir_list="
include/
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c71889f3f3fc..322813366c6c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -142,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
touch_nmi_watchdog();
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index b64c44ae4c25..2531f3496ab6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
depends on SMP
select IRQ_DOMAIN_HIERARCHY
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
+ bool
+ depends on SMP
+
# Generic MSI hierarchical interrupt domain support
config GENERIC_MSI_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d9a5c1d65a79..44a4eba80315 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,398 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (cpumask_empty(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- /* Ensure that only CPUs which are in both masks are set */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- cpus_read_lock();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- cpus_read_unlock();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -461,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = group_cpus_evenly(this_vecs);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index bbd945bacef0..961d4af76af3 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
- const struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+ const struct cpumask *ipimask;
- if (!chip || !ipimask)
+ if (!chip || !data)
return -EINVAL;
if (!chip->ipi_send_single && !chip->ipi_send_mask)
@@ -199,6 +199,10 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (cpu >= nr_cpu_ids)
return -EINVAL;
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask)
+ return -EINVAL;
+
if (dest) {
if (!cpumask_subset(dest, ipimask))
return -EINVAL;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index fd0996274401..240e145e969f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -277,7 +277,7 @@ static struct attribute *irq_attrs[] = {
};
ATTRIBUTE_GROUPS(irq);
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = irq_groups,
@@ -335,7 +335,7 @@ postcore_initcall(irq_sysfs_init);
#else /* !CONFIG_SYSFS */
-static struct kobj_type irq_kobj_type = {
+static const struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
};
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fe1da9614ee..f34760a1e222 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
+ unsigned int size,
+ irq_hw_number_t hwirq_max,
+ int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irqchip_fwid *fwid;
struct irq_domain *domain;
@@ -214,25 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
- if (direct_max) {
+ if (direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- }
domain->revmap_size = size;
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @fwnode: firmware node for the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initializes an irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
+ ops, host_data);
+ if (domain)
+ __irq_domain_publish(domain);
+
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
@@ -437,31 +470,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
- * IRQ remapping
- *
- * Return: false if any MSI irq domain does not support IRQ remapping,
- * true otherwise (including if there is no MSI irq domain)
- */
-bool irq_domain_check_msi_remap(void)
-{
- struct irq_domain *h;
- bool ret = true;
-
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(h, &irq_domain_list, link) {
- if (irq_domain_is_msi(h) &&
- !irq_domain_hierarchical_is_msi_remap(h)) {
- ret = false;
- break;
- }
- }
- mutex_unlock(&irq_domain_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
-
-/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
@@ -502,30 +510,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], NULL);
else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], irq_data);
else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -538,6 +550,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -557,10 +572,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -573,7 +590,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -590,23 +606,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -619,9 +641,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
@@ -668,6 +689,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
#endif
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
+
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
@@ -680,14 +729,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
* on the number returned from that call.
*/
unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
- irq_hw_number_t hwirq,
- const struct irq_affinity_desc *affinity)
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
@@ -695,32 +741,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
- of_node = irq_domain_get_of_node(domain);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
- }
-
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
- affinity);
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
- }
-
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(of_node), virq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -789,6 +822,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -801,7 +836,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -809,40 +844,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -1102,12 +1142,17 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
struct irq_domain *domain;
if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
+ domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
+ domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
+
if (domain) {
+ if (parent)
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
+
+ __irq_domain_publish(domain);
}
return domain;
@@ -1123,10 +1168,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1426,40 +1467,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program the hardware with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1478,24 +1491,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
- }
for (i = 0; i < nr_irqs; i++) {
ret = irq_domain_trim_hierarchy(virq + i);
- if (ret) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret)
goto out_free_irq_data;
- }
}
-
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1505,6 +1512,48 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
@@ -1512,11 +1561,12 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
+ lockdep_assert_held(&d->domain->root->mutex);
+
if (irq_domain_is_nomap(d->domain))
return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_mutex);
if (d->hwirq < d->domain->revmap_size) {
/* Not using radix tree */
rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
@@ -1525,7 +1575,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
- mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1541,8 +1590,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1567,47 +1616,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1623,8 +1671,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1646,37 +1694,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1690,17 +1738,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
@@ -1815,20 +1866,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
-
-/**
- * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
- * parent has MSI remapping support
- * @domain: domain pointer
- */
-bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
-{
- for (; domain; domain = domain->parent) {
- if (irq_domain_is_msi_remap(domain))
- return true;
- }
- return false;
-}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
@@ -1865,6 +1902,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
@@ -1915,7 +1959,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5b7cf28df290..8ce75495e04f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 955267bbc2be..7a97bcb086bf 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -830,11 +830,8 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
fwnode, &msi_domain_ops, info);
- if (domain) {
- if (!domain->name && info->chip)
- domain->name = info->chip->name;
+ if (domain)
irq_domain_update_bus_token(domain, info->bus_token);
- }
return domain;
}
@@ -1000,7 +997,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
fail:
msi_unlock_descs(dev);
free_fwnode:
- kfree(fwnode);
+ irq_domain_free_fwnode(fwnode);
free_bundle:
kfree(bundle);
return false;
@@ -1013,6 +1010,7 @@ free_bundle:
*/
void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
{
+ struct fwnode_handle *fwnode = NULL;
struct msi_domain_info *info;
struct irq_domain *domain;
@@ -1025,7 +1023,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
unlock:
@@ -1080,10 +1081,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
struct xarray *xa;
int ret, virq;
- if (!msi_ctrl_valid(dev, &ctrl))
- return -EINVAL;
-
msi_lock_descs(dev);
+
+ if (!msi_ctrl_valid(dev, &ctrl)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
if (ret)
goto unlock;
@@ -1105,14 +1109,35 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
return 0;
fail:
- for (--virq; virq >= virq_base; virq--)
+ for (--virq; virq >= virq_base; virq--) {
+ msi_domain_depopulate_descs(dev, virq, 1);
irq_domain_free_irqs_common(domain, virq, 1);
+ }
msi_domain_free_descs(dev, &ctrl);
unlock:
msi_unlock_descs(dev);
return ret;
}
+void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
+{
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
+ desc->irq = 0;
+}
+
/*
* Carefully check whether the device can use reservation mode. If
* reservation mode is enabled then the early activation will assign a
@@ -1623,3 +1648,30 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
+
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index e5cd09fd8a05..84c717337df0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
goto exit;
}
spin_unlock_irqrestore(&kcov->lock, flags);
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
for (off = 0; off < size; off += PAGE_SIZE) {
page = vmalloc_to_page(kcov->area + off);
res = vm_insert_page(vma, vma->vm_start + off, page);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb8e6e6f983c..92d301f98776 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -190,10 +190,12 @@ out_unlock:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 969e8f52f7da..3d578c6fefee 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -920,10 +921,64 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
+
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
+
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
+
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
-int kexec_load_disabled;
+static int kexec_load_disabled;
+
#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
+
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
+
+ return ret;
+ }
+
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
+
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
static struct ctl_table kexec_core_sysctls[] = {
{
.procname = "kexec_load_disabled",
@@ -935,6 +990,18 @@ static struct ctl_table kexec_core_sysctls[] = {
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
{ }
};
@@ -946,6 +1013,32 @@ static int __init kexec_core_sysctl_init(void)
late_initcall(kexec_core_sysctl_init);
#endif
+bool kexec_load_permitted(int kexec_image_type)
+{
+ struct kexec_load_limit *limit;
+
+ /*
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
+ */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
+ }
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+
+ return true;
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -975,7 +1068,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
-void crash_kexec(struct pt_regs *regs)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
int old_cpu, this_cpu;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index dd5983010b7b..f1a0e4e3fb5c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -326,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
unsigned long, cmdline_len, const char __user *, cmdline_ptr,
unsigned long, flags)
{
- int ret = 0, i;
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
struct kimage **dest_image, *image;
+ int ret = 0, i;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
/* Make sure we have a legal set of flags */
@@ -342,11 +344,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (!kexec_trylock())
return -EBUSY;
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH) {
+ if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
+ } else {
+ dest_image = &kexec_image;
}
if (flags & KEXEC_FILE_UNLOAD)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1c18ecf9f98b..00e177de91cc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -458,7 +458,7 @@ static inline int kprobe_optready(struct kprobe *p)
}
/* Return true if the kprobe is disarmed. Note: p must be on hash list */
-static inline bool kprobe_disarmed(struct kprobe *p)
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -555,17 +555,15 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop on 'freeing_list' for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
@@ -662,7 +660,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -797,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2df00b789b90..0408aab80941 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -51,6 +51,14 @@ static ssize_t cpu_byteorder_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(cpu_byteorder);
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
@@ -233,6 +241,7 @@ static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
&cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f97fd01a2932..7e6751b29101 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kthread_flush_worker(worker);
kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 201f0c0482fb..4bd2d5e10f20 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -118,7 +118,6 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
unsigned long count;
@@ -148,15 +147,9 @@ static int klp_find_callback(void *data, const char *name,
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
if (strcmp(args->name, name))
return 0;
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
return klp_match_callback(data, addr);
}
@@ -164,7 +157,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
.count = 0,
@@ -172,7 +164,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
};
if (objname)
- module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
else
kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
@@ -268,6 +260,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
return 0;
}
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+}
+
/*
* At a high-level, there are two types of klp relocation sections: those which
* reference symbols which live in vmlinux; and those which reference symbols
@@ -291,10 +291,10 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* the to-be-patched module to be loaded and patched sometime *after* the
* klp module is loaded.
*/
-int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
- const char *shstrtab, const char *strtab,
- unsigned int symndx, unsigned int secndx,
- const char *objname)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
@@ -316,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
if (strcmp(objname ? objname : "vmlinux", sec_objname))
return 0;
- ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
- if (ret)
- return ret;
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
+ return ret;
+
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ }
+
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+ return 0;
+}
- return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
+{
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
/*
@@ -769,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-static int klp_apply_object_relocs(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
{
int i, ret;
struct klp_modinfo *info = patch->mod->klp_info;
@@ -781,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
continue;
- ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
info->secstrings,
patch->mod->core_kallsyms.strtab,
- info->symndx, i, obj->name);
+ info->symndx, i, obj->name, apply);
if (ret)
return ret;
}
@@ -792,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,
return 0;
}
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
+
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -1179,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
klp_unpatch_object(obj);
klp_post_unpatch_callback(obj);
-
+ klp_clear_object_relocs(patch, obj);
klp_free_object_loaded(obj);
break;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e3375bc40dad..50d4863974e7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9c2fb613a55d..f04b1978899d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -46,6 +46,9 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
@@ -127,15 +130,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused)
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -179,7 +217,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -206,7 +244,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -275,7 +313,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -318,7 +356,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -358,7 +396,7 @@ __releases(torture_mutex)
static struct lock_torture_ops mutex_lock_ops = {
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -456,7 +494,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -474,37 +512,6 @@ __acquires(torture_rtmutex)
return 0;
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
-{
- const unsigned int factor = 50000; /* yes, quite arbitrary */
-
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- sched_set_fifo(current);
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- sched_set_normal(current, 0);
- } else /* common case, do nothing */
- return;
- }
-}
-
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
@@ -530,10 +537,18 @@ __releases(torture_rtmutex)
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -600,7 +615,7 @@ __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -652,7 +667,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
.exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2b23378775fe..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 010cf4e6d0b8..728f434de2bb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_state(waiter->task, waiter->wake_state);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de03..acb5a50309a1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
- bool ret = false;
- preempt_disable();
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- ret = true;
+ return true;
}
- preempt_enable();
- return ret;
+ return false;
}
/*
@@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
*/
if (first->handoff_set && (waiter != first))
return false;
-
- /*
- * First waiter can inherit a previously set handoff
- * bit and spin on rwsem if lock acquisition fails.
- */
- if (waiter == first)
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
@@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
/*
* Disable preemption is equal to the RCU read-side crital section,
* thus the task_strcut structure won't go away.
@@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
@@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -1092,7 +1083,7 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
@@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
if (waiter.handoff_set) {
enum owner_state owner_state;
- preempt_disable();
owner_state = rwsem_spin_on_owner(sem);
- preempt_enable();
-
if (owner_state == OWNER_NULL)
goto trylock_again;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
@@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
*/
static inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
static inline void __down_read(struct rw_semaphore *sem)
@@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ preempt_disable();
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
}
- return 0;
+ preempt_enable();
+ return ret;
}
/*
@@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem)
preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
- preempt_enable();
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
#else /* !CONFIG_PREEMPT_RT */
@@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 4523f99b0358..ab2376a1be88 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -494,7 +494,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
@@ -509,6 +510,9 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (mod->state == MODULE_STATE_UNFORMED)
continue;
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
/* Use rcu_dereference_sched() to remain compliant with the sparse tool */
preempt_disable();
kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -525,6 +529,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (ret != 0)
goto out;
}
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
}
out:
mutex_unlock(&module_mutex);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 48568a0f5651..d3be89de706d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
@@ -2393,7 +2394,8 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
@@ -2569,20 +2571,35 @@ static int add_unformed_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
finished_loading(mod->name));
if (err)
goto out_unlocked;
- goto again;
+
+ /* The module might have gone in the meantime. */
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name),
+ true);
}
- err = -EEXIST;
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ err = -EEXIST;
+ else
+ err = -EBUSY;
goto out;
}
mod_update_bounds(mod);
@@ -2659,7 +2676,7 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
int ret;
if (strcmp(param, "async_probe") == 0) {
- if (strtobool(val, &mod->async_probe_requested))
+ if (kstrtobool(val, &mod->async_probe_requested))
mod->async_probe_requested = true;
return 0;
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ab75637fd904..d353e4b5402d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -456,7 +456,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -573,8 +572,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/panic.c b/kernel/panic.c
index 463c9295bc28..5cfea8302d23 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -211,9 +212,6 @@ static void panic_print_sys_info(bool console_flush)
return;
}
- if (panic_print & PANIC_PRINT_ALL_CPU_BT)
- trigger_all_cpu_backtrace();
-
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -243,6 +241,30 @@ void check_panic_on_warn(const char *origin)
origin, limit);
}
+/*
+ * Helper that triggers the NMI backtrace (if set in panic_print)
+ * and then performs the secondary CPUs shutdown - we cannot have
+ * the NMI backtrace after the CPUs are off!
+ */
+static void panic_other_cpus_shutdown(bool crash_kexec)
+{
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
+
+ /*
+ * Note that smp_send_stop() is the usual SMP shutdown function,
+ * which unfortunately may not be hardened to work in a panic
+ * situation. If we want to do crash dump after notifier calls
+ * and kmsg_dump, we will need architecture dependent extra
+ * bits in addition to stopping other CPUs, hence we rely on
+ * crash_smp_send_stop() for that.
+ */
+ if (!crash_kexec)
+ smp_send_stop();
+ else
+ crash_smp_send_stop();
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -333,23 +355,10 @@ void panic(const char *fmt, ...)
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!_crash_kexec_post_notifiers) {
+ if (!_crash_kexec_post_notifiers)
__crash_kexec(NULL);
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a
- * panic situation.
- */
- smp_send_stop();
- } else {
- /*
- * If we want to do crash dump after notifier calls and
- * kmsg_dump, we will need architecture dependent extra
- * works in addition to stopping other CPUs.
- */
- crash_smp_send_stop();
- }
+ panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
/*
* Run any panic handlers, including those that might need to
@@ -679,6 +688,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -693,11 +703,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -705,6 +717,7 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
diff --git a/kernel/params.c b/kernel/params.c
index 14d66070757b..6e34ca89ebae 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -4,6 +4,7 @@
*/
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
@@ -310,7 +311,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index f4f8cb0435b4..46e0d5a3f91f 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ initialize_memfd_noexec_scope(ns);
+
return ns;
out_free_idr:
@@ -244,7 +247,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
+ /*
+ * Release tasks_rcu_exit_srcu to avoid following deadlock:
+ *
+ * 1) TASK A unshare(CLONE_NEWPID)
+ * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
+ * and TASK C
+ * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
+ * 4) TASK A calls synchronize_rcu_tasks()
+ * -> synchronize_srcu(tasks_rcu_exit_srcu)
+ * 5) *DEADLOCK*
+ *
+ * It is considered safe to release tasks_rcu_exit_srcu here
+ * because we assume the current task can not be concurrently
+ * reaped at this point.
+ */
+ exit_tasks_rcu_stop();
schedule();
+ exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
@@ -455,6 +475,8 @@ static __init int pid_namespaces_init(void)
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..e22d072e1e24
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ ns->memfd_noexec_scope =
+ task_active_pid_ns(current)->memfd_noexec_scope;
+}
+
+static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+ if (ns != &init_pid_ns)
+ table_copy.data = &ns->memfd_noexec_scope;
+
+ /*
+ * set minimum to current value, the effect is only bigger
+ * value is accepted.
+ */
+ if (*(int *)table_copy.data > *(int *)table_copy.extra1)
+ table_copy.extra1 = table_copy.data;
+
+ return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+static struct ctl_path vm_path[] = { { .procname = "vm", }, { } };
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl_paths(vm_path, pid_ns_ctl_table_vm);
+}
+#else
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 60a1d3051cc7..4b31629c5be4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -118,7 +118,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index f82111837b8d..7b44f5b89fa1 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -87,10 +87,7 @@ static void em_debug_create_pd(struct device *dev)
static void em_debug_remove_pd(struct device *dev)
{
- struct dentry *debug_dir;
-
- debug_dir = debugfs_lookup(dev_name(dev), rootdir);
- debugfs_remove_recursive(debug_dir);
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
static int __init em_debug_init(void)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 277434b6c0bf..36a1df48280c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -596,7 +596,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
@@ -623,7 +623,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
@@ -640,7 +640,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
@@ -948,9 +948,9 @@ out_finish:
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
- * in a file-alike way
+ * in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
@@ -1123,7 +1123,7 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
+/*
* Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
index c85be186a783..a6b27526baaf 100644
--- a/kernel/printk/index.c
+++ b/kernel/printk/index.c
@@ -145,7 +145,7 @@ static void pi_create_file(struct module *mod)
#ifdef CONFIG_MODULES
static void pi_remove_file(struct module *mod)
{
- debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index));
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
}
static int pi_module_notify(struct notifier_block *nb, unsigned long op,
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index d947ca6c84f9..2a17704136f1 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -14,6 +14,21 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
+
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
+
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
+
/* Flags for a single printk record. */
enum printk_info_flags {
LOG_NEWLINE = 2, /* text ended with a newline */
@@ -48,6 +63,10 @@ u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
#else
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
+
/*
* In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
@@ -58,3 +77,29 @@ u16 printk_parse_prefix(const char *text, int *level,
static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a5ed2e53547c..fd0c9f913940 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -466,21 +466,6 @@ static struct latched_seq clear_seq = {
.val[1] = 0,
};
-#ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
-#else
-#define PREFIX_MAX 32
-#endif
-
-/* the maximum size of a formatted record (i.e. with prefix added per line) */
-#define CONSOLE_LOG_MAX 1024
-
-/* the maximum size for a dropped text message */
-#define DROPPED_TEXT_MAX 64
-
-/* the maximum size allowed to be reserved for a record */
-#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
-
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -711,16 +696,15 @@ out:
return len;
}
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
- char buf[CONSOLE_EXT_LOG_MAX];
-
- struct printk_info info;
- char text_buf[CONSOLE_EXT_LOG_MAX];
- struct printk_record record;
+ struct printk_buffers pbufs;
};
static __printf(3, 4) __cold
@@ -746,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > LOG_LINE_MAX)
+ if (!user || len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -802,8 +786,10 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_record *r = &user->record;
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
if (!user)
@@ -813,7 +799,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
goto out;
@@ -830,36 +816,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
* This pairs with __wake_up_klogd:A.
*/
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb,
- atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
}
- if (r->info->seq != atomic64_read(&user->seq)) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- atomic64_set(&user->seq, r->info->seq);
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
goto out;
}
- len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
- len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- &r->text_buf[0], r->info->text_len,
- &r->info->dev_info);
-
- atomic64_set(&user->seq, r->info->seq + 1);
+ atomic64_set(&user->seq, pmsg.seq + 1);
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
@@ -953,9 +934,6 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- prb_rec_init_rd(&user->record, &user->info,
- &user->text_buf[0], sizeof(user->text_buf));
-
atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
@@ -1150,7 +1128,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
return prb_record_text_space(&e);
}
-static char setup_text_buf[LOG_LINE_MAX] __initdata;
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
void __init setup_log_buf(int early)
{
@@ -1416,7 +1394,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog,
size_t text_len = r->info->text_len;
size_t buf_size = r->text_buf_size;
char *text = r->text_buf;
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
bool truncated = false;
size_t prefix_len;
size_t line_len;
@@ -1515,7 +1493,7 @@ static size_t get_record_print_text_size(struct printk_info *info,
unsigned int line_count,
bool syslog, bool time)
{
- char prefix[PREFIX_MAX];
+ char prefix[PRINTK_PREFIX_MAX];
size_t prefix_len;
prefix_len = info_print_prefix(info, syslog, time, prefix);
@@ -1581,11 +1559,11 @@ static int syslog_print(char __user *buf, int size)
int len = 0;
u64 seq;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
mutex_lock(&syslog_lock);
@@ -1686,7 +1664,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 seq;
bool time;
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -1698,7 +1676,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
size, true, time);
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -2013,27 +1991,6 @@ static int console_trylock_spinning(void)
}
/*
- * Call the specified console driver, asking it to write out the specified
- * text and length. If @dropped_text is non-NULL and any records have been
- * dropped, a dropped message will be written out first.
- */
-static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
-{
- size_t dropped_len;
-
- if (con->dropped && dropped_text) {
- dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
- "** %lu printk messages dropped **\n",
- con->dropped);
- con->dropped = 0;
- con->write(con, dropped_text, dropped_len);
- }
-
- con->write(con, text, len);
-}
-
-/*
* Recursion is tracked separately on each CPU. If NMIs are supported, an
* additional NMI context per CPU is also separately tracked. Until per-CPU
* is available, a separate "early tracking" is performed.
@@ -2196,7 +2153,7 @@ static u16 printk_sprint(char *text, u16 size, int facility,
}
}
- trace_console_rcuidle(text, text_len);
+ trace_console(text, text_len);
return text_len;
}
@@ -2243,8 +2200,8 @@ int vprintk_store(int facility, int level,
reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
va_end(args2);
- if (reserve_size > LOG_LINE_MAX)
- reserve_size = LOG_LINE_MAX;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
/* Extract log level or control flags. */
if (facility == 0)
@@ -2258,7 +2215,7 @@ int vprintk_store(int facility, int level,
if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
facility, &flags, fmt, args);
r.info->text_len += text_len;
@@ -2389,8 +2346,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
#else /* CONFIG_PRINTK */
-#define CONSOLE_LOG_MAX 0
-#define DROPPED_TEXT_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
@@ -2414,10 +2369,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
-static void call_console_driver(struct console *con, const char *text, size_t len,
- char *dropped_text)
-{
-}
static bool suppress_message_printing(int level) { return false; }
static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
@@ -2744,16 +2695,136 @@ static void __console_unlock(void)
}
/*
- * Print one record for the given console. The record printed is whatever
- * record is the next available record for the given console.
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
+ * is achieved by shifting the existing message over and inserting the dropped
+ * message.
+ *
+ * @pmsg is the printk message to prepend.
*
- * @text is a buffer of size CONSOLE_LOG_MAX.
+ * @dropped is the dropped count to report in the dropped message.
*
- * If extended messages should be printed, @ext_text is a buffer of size
- * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL.
+ * If the message text in @pmsg->pbufs->outbuf does not have enough space for
+ * the dropped message, the message text will be sufficiently truncated.
*
- * If dropped messages should be printed, @dropped_text is a buffer of size
- * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
+ */
+#ifdef CONFIG_PRINTK
+static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+{
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ size_t len;
+
+ len = scnprintf(scratchbuf, scratchbuf_sz,
+ "** %lu printk messages dropped **\n", dropped);
+
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
+ return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
+ }
+
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
+}
+#else
+#define console_prepend_dropped(pmsg, dropped)
+#endif /* CONFIG_PRINTK */
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
+ *
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
+ *
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
+ *
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
+ *
+ * @may_supress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
+ */
+static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
+{
+ static int panic_console_dropped;
+
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
+
+ /*
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
+ *
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
+ */
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
+
+ /*
+ * Check for dropped messages in panic here so that printk
+ * suppression can occur as early as possible if necessary.
+ */
+ if (pmsg->dropped &&
+ panic_in_progress() &&
+ panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+ }
+
+ /* Skip record that has level above the console loglevel. */
+ if (may_suppress && suppress_message_printing(r.info->level))
+ goto out;
+
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
+out:
+ pmsg->outbuf_len = len;
+ return true;
+}
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
*
* @handover will be set to true if a printk waiter has taken over the
* console_lock, in which case the caller is no longer holding both the
@@ -2766,46 +2837,33 @@ static void __console_unlock(void)
*
* Requires the console_lock and the SRCU read lock.
*/
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover, int cookie)
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
- static int panic_console_dropped;
- struct printk_info info;
- struct printk_record r;
- unsigned long flags;
- char *write_text;
- size_t len;
+ static struct printk_buffers pbufs;
- prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &pbufs,
+ };
+ unsigned long flags;
*handover = false;
- if (!prb_read_valid(prb, con->seq, &r))
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
return false;
- if (con->seq != r.info->seq) {
- con->dropped += r.info->seq - con->seq;
- con->seq = r.info->seq;
- if (panic_in_progress() && panic_console_dropped++ > 10) {
- suppress_panic_printk = 1;
- pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
- }
- }
+ con->dropped += pmsg.dropped;
- /* Skip record that has level above the console loglevel. */
- if (suppress_message_printing(r.info->level)) {
- con->seq++;
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
goto skip;
}
- if (ext_text) {
- write_text = ext_text;
- len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info);
- len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len,
- &r.text_buf[0], r.info->text_len, &r.info->dev_info);
- } else {
- write_text = text;
- len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
}
/*
@@ -2821,11 +2879,15 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
- stop_critical_timings(); /* don't trace print latency */
- call_console_driver(con, write_text, len, dropped_text);
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ /* Write everything out to the hardware. */
+ con->write(con, outbuf, pmsg.outbuf_len);
+
start_critical_timings();
- con->seq++;
+ con->seq = pmsg.seq + 1;
*handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
@@ -2858,9 +2920,6 @@ skip:
*/
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
- static char dropped_text[DROPPED_TEXT_MAX];
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
bool any_usable = false;
struct console *con;
bool any_progress;
@@ -2880,16 +2939,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
continue;
any_usable = true;
- if (console_srcu_read_flags(con) & CON_EXTENDED) {
- /* Extended consoles do not print "dropped messages". */
- progress = console_emit_next_record(con, &text[0],
- &ext_text[0], NULL,
- handover, cookie);
- } else {
- progress = console_emit_next_record(con, &text[0],
- NULL, &dropped_text[0],
- handover, cookie);
- }
+ progress = console_emit_next_record(con, handover, cookie);
/*
* If a handover has occurred, the SRCU read lock
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54482193e1ed..0786450074c1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 232e29fe3e5e..2984de629f74 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -82,7 +82,7 @@ config RCU_CPU_STALL_TIMEOUT
config RCU_EXP_CPU_STALL_TIMEOUT
int "Expedited RCU CPU stall timeout in milliseconds"
depends on RCU_STALL_COMMON
- range 0 21000
+ range 0 300000
default 0
help
If a given expedited RCU grace period extends more than the
@@ -92,6 +92,19 @@ config RCU_EXP_CPU_STALL_TIMEOUT
says to use the RCU_CPU_STALL_TIMEOUT value converted from
seconds to milliseconds.
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ Collect statistics during the sampling period, such as the number of
+ (hard interrupts, soft interrupts, task switches) and the cputime of
+ (hard interrupts, soft interrupts, kernel tasks) are added to the
+ RCU stall report. For multiple continuous RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this one, but will override this if it exists.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index c5aa934de59b..115616ac3bfa 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -224,6 +224,8 @@ extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
int rcu_exp_jiffies_till_stall_check(void);
@@ -447,14 +449,20 @@ do { \
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index c54ea2b6a36b..f71fac422c8f 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
}
/* Get the length of a segment of the rcu_segcblist structure. */
-static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
{
return READ_ONCE(rsclp->seglen[seg]);
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 431cee212467..4fe877f5f654 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
/* Return number of callbacks in segmented callback list by summing seglen. */
long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 634df26a2c27..8e6c023212cb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -399,7 +399,7 @@ static int torture_readlock_not_held(void)
return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
}
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
@@ -441,7 +441,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
@@ -625,7 +625,7 @@ static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
+static int srcu_torture_read_lock(void)
{
if (cur_ops == &srcud_ops)
return srcu_read_lock_nmisafe(srcu_ctlp);
@@ -652,7 +652,7 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
+static void srcu_torture_read_unlock(int idx)
{
if (cur_ops == &srcud_ops)
srcu_read_unlock_nmisafe(srcu_ctlp, idx);
@@ -814,13 +814,13 @@ static void synchronize_rcu_trivial(void)
}
}
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+static int rcu_torture_read_lock_trivial(void)
{
preempt_disable();
return 0;
}
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+static void rcu_torture_read_unlock_trivial(int idx)
{
preempt_enable();
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 435c884c02b5..afa3e1a2f690 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -76,6 +76,8 @@ torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
torture_param(long, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
@@ -124,7 +126,7 @@ static int exp_idx;
// Operations vector for selecting different types of tests.
struct ref_scale_ops {
- void (*init)(void);
+ bool (*init)(void);
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
@@ -162,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl
}
}
-static void rcu_sync_scale_init(void)
+static bool rcu_sync_scale_init(void)
{
+ return true;
}
static struct ref_scale_ops rcu_ops = {
@@ -315,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = {
// Definitions for rwlock
static rwlock_t test_rwlock;
-static void ref_rwlock_init(void)
+static bool ref_rwlock_init(void)
{
rwlock_init(&test_rwlock);
+ return true;
}
static void ref_rwlock_section(const int nloops)
@@ -351,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = {
// Definitions for rwsem
static struct rw_semaphore test_rwsem;
-static void ref_rwsem_init(void)
+static bool ref_rwsem_init(void)
{
init_rwsem(&test_rwsem);
+ return true;
}
static void ref_rwsem_section(const int nloops)
@@ -523,6 +528,237 @@ static struct ref_scale_ops clock_ops = {
.name = "clock"
};
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ b = READ_ONCE(rtsp->a);
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static struct ref_scale_ops typesafe_ref_ops;
+static struct ref_scale_ops typesafe_lock_ops;
+static struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -812,6 +1048,7 @@ ref_scale_init(void)
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
@@ -833,7 +1070,10 @@ ref_scale_init(void)
goto unwind;
}
if (cur_ops->init)
- cur_ops->init();
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
ref_scale_print_module_parms(cur_ops, "Start of test");
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ca4b5dcec675..ab4ee58af84b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -154,7 +154,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
*/
static inline bool srcu_invl_snp_seq(unsigned long s)
{
- return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
+ return s == SRCU_SNP_INIT_SEQ;
}
/*
@@ -469,24 +469,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
/*
* If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does
- * not mean that there are no more readers, as one could have read
- * the current index but not have incremented the lock counter yet.
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_idx but not yet have incremented its CPU's
+ * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
+ * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
+ * in a system whose address space was fully populated with memory.
+ * Call this quantity Nt.
*
- * So suppose that the updater is preempted here for so long
- * that more than ULONG_MAX non-nested readers come and go in
- * the meantime. It turns out that this cannot result in overflow
- * because if a reader modifies its unlock count after we read it
- * above, then that reader's next load of ->srcu_idx is guaranteed
- * to get the new value, which will cause it to operate on the
- * other bank of counters, where it cannot contribute to the
- * overflow of these counters. This means that there is a maximum
- * of 2*NR_CPUS increments, which cannot overflow given current
- * systems, especially not on 64-bit systems.
+ * So suppose that the updater is preempted at this point in the
+ * code for a long time. That now-preempted updater has already
+ * flipped ->srcu_idx (possibly during the preceding grace period),
+ * done an smp_mb() (again, possibly during the preceding grace
+ * period), and summed up the ->srcu_unlock_count[idx] counters.
+ * How many times can a given one of the aforementioned Nt tasks
+ * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
+ * counter, in the absence of nesting?
*
- * OK, how about nesting? This does impose a limit on nesting
- * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
- * especially on 64-bit systems.
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_idx and is just about to use that value
+ * to index its increment of ->srcu_lock_count[idx]. But as soon as
+ * it leaves that SRCU read-side critical section, it will increment
+ * ->srcu_unlock_count[idx], which must follow the updater's above
+ * read from that same value. Thus, as soon the reading task does
+ * an smp_mb() and a later fetch from ->srcu_idx, that task will be
+ * guaranteed to get the new index. Except that the increment of
+ * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
+ * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
+ * is before the smp_mb(). Thus, that task might not see the new
+ * value of ->srcu_idx until the -second- __srcu_read_lock(),
+ * which in turn means that this task might well increment
+ * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
+ * not just once.
+ *
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
+ *
+ * That is, there can be almost Nt + Nc further increments of
+ * ->srcu_lock_count[idx] for the old index, where Nc is the number
+ * of CPUs. But this is OK because the size of the task_struct
+ * structure limits the value of Nt and current systems limit Nc
+ * to a few thousand.
+ *
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
*/
return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
@@ -726,7 +761,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
int state;
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
- sdp = per_cpu_ptr(ssp->sda, 0);
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = this_cpu_ptr(ssp->sda);
lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
@@ -837,7 +872,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
/* Initiate callback invocation as needed. */
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
- srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
} else {
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
srcu_for_each_node_breadth_first(ssp, snp) {
@@ -914,7 +950,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (snp)
for (; snp != NULL; snp = snp->srcu_parent) {
sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -941,6 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
*
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
+ *
+ * The srcu read lock should be hold around this function. And s is a seq snap
+ * after holding that lock.
*/
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
@@ -961,7 +1000,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp_leaf)
/* Each pass through the loop does one level of the srcu_node tree. */
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
@@ -998,8 +1037,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- /* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
srcu_gp_start(ssp);
@@ -1059,10 +1098,11 @@ static void srcu_flip(struct srcu_struct *ssp)
/*
* Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
*/
smp_mb(); /* D */ /* Pairs with C. */
}
@@ -1161,7 +1201,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
- sdp = per_cpu_ptr(ssp->sda, 0);
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_sdp_contention(sdp, &flags);
@@ -1497,7 +1537,7 @@ void srcu_barrier(struct srcu_struct *ssp)
idx = __srcu_read_lock_nmisafe(ssp);
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
- srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
else
for_each_possible_cpu(cpu)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index fe9840d90e96..bfb5e1549f2b 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -384,6 +384,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
{
int cpu;
unsigned long flags;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
long n;
long ncbs = 0;
long ncbsnz = 0;
@@ -425,21 +426,23 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
- if (rcu_task_cb_adjust && !ncbsnz &&
- poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) {
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
- for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
- struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
- WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
@@ -560,8 +563,9 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
- WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
@@ -827,11 +831,21 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
static void rcu_tasks_postscan(struct list_head *hop)
{
/*
- * Wait for tasks that are in the process of exiting. This
- * does only part of the job, ensuring that all tasks that were
- * previously exiting reach the point where they have disabled
- * preemption, allowing the later synchronize_rcu() to finish
- * the job.
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+ * 1) An _SRCU_ read side starting before calling exit_notify(),
+ * which may remove the task from the tasklist, and ending after
+ * the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
}
@@ -898,7 +912,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after the synchronize_srcu() above.
+ * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
*/
synchronize_rcu();
}
@@ -988,27 +1005,42 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
- preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
- preempt_enable();
}
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
{
struct task_struct *t = current;
- preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
- preempt_enable();
- exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ exit_tasks_rcu_stop();
+ exit_tasks_rcu_finish_trace(current);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1036,9 +1068,6 @@ static void rcu_tasks_be_rude(struct work_struct *work)
// Wait for one rude RCU-tasks grace period.
static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
{
- if (num_online_cpus() <= 1)
- return; // Fastpath for only one CPU.
-
rtp->n_ipis += cpumask_weight(cpu_online_mask);
schedule_on_each_cpu(rcu_tasks_be_rude);
}
@@ -1815,23 +1844,21 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
static void rcu_tasks_initiate_self_tests(void)
{
- unsigned long j = jiffies;
-
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
- tests[0].runstart = j;
+ tests[0].runstart = jiffies;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
- tests[1].runstart = j;
+ tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
- tests[2].runstart = j;
+ tests[2].runstart = jiffies;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
#endif
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 72913ce21258..42f7589e51e0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -246,15 +246,12 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
- if (head) {
- void *ptr = (void *) head - (unsigned long) func;
-
+ if (head)
kasan_record_aux_stack_noalloc(ptr);
- }
- __kvfree_call_rcu(head, func);
+ __kvfree_call_rcu(head, ptr);
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cf34a961821a..8e880c09ab59 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
/*
* rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
@@ -215,27 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->qsmaskinitnext);
-}
-
-/*
- * Is the CPU corresponding to the specified rcu_data structure online
- * from RCU's perspective? This perspective is given by that structure's
- * ->qsmaskinitnext field rather than by the global cpu_online_mask.
- */
-static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
-{
- return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
-}
-
-/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
- struct rcu_data *rdp;
- bool ret = false;
-
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable_notrace();
- rdp = this_cpu_ptr(&rcu_data);
- /*
- * Strictly, we care here about the case where the current CPU is
- * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
- * not being up to date. So arch_spin_is_locked() might have a
- * false positive if it's held by some *other* CPU, but that's
- * OK because that just means a false *negative* on the warning.
- */
- if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
- ret = true;
- preempt_enable_notrace();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -925,6 +866,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
}
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
}
return 0;
@@ -1350,13 +1309,6 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
-// Has rcu_init() been invoked? This is used (for example) to determine
-// whether spinlocks may be acquired safely.
-static bool rcu_init_invoked(void)
-{
- return !!rcu_state.n_online_cpus;
-}
-
// Make the polled API aware of the beginning of a grace period.
static void rcu_poll_gp_seq_start(unsigned long *snap)
{
@@ -2092,92 +2044,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
}
/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
- bool blkd;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
- return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- raw_lockdep_assert_held_rcu_node(rnp_leaf);
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
- WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- /* Between grace periods, so better already be zero! */
- WARN_ON_ONCE(rnp->qsmask);
- if (rnp->qsmaskinit) {
- raw_spin_unlock_rcu_node(rnp);
- /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
*/
@@ -2209,7 +2075,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
@@ -2727,10 +2593,11 @@ static void check_cb_ovld(struct rcu_data *rdp)
}
static void
-__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
bool was_alldone;
@@ -2755,6 +2622,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
kasan_record_aux_stack_noalloc(head);
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -2876,13 +2744,15 @@ EXPORT_SYMBOL_GPL(call_rcu);
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ unsigned long gp_snap;
unsigned long nr_records;
- struct kvfree_rcu_bulk_data *next;
void *records[];
};
@@ -2898,26 +2768,28 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @initialized: The @rcu_work fields have been initialized
- * @count: Number of objects for which GP not started
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
* @bkvcache:
* A simple cache list that contains objects for reuse purpose.
* In order to save some per-cpu space the list is singular.
@@ -2935,13 +2807,20 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
struct rcu_head *head;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
bool initialized;
- int count;
struct delayed_work page_cache_work;
atomic_t backoff_page_cache_fill;
@@ -3029,29 +2908,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
return freed;
}
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ debug_rcu_bhead_unqueue(bnode);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bkvhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
- struct rcu_head *head, *next;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
- int i, j;
+ int i;
krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
+ struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
raw_spin_lock_irqsave(&krcp->lock, flags);
// Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- bkvhead[i] = krwp->bkvhead_free[i];
- krwp->bkvhead_free[i] = NULL;
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
// Channel 3.
head = krwp->head_free;
@@ -3060,39 +2997,9 @@ static void kfree_rcu_work(struct work_struct *work)
// Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
- for (; bkvhead[i]; bkvhead[i] = bnext) {
- bnext = bkvhead[i]->next;
- debug_rcu_bhead_unqueue(bkvhead[i]);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (i == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bkvhead[i]->nr_records,
- bkvhead[i]->records);
-
- kfree_bulk(bkvhead[i]->nr_records,
- bkvhead[i]->records);
- } else { // vmalloc() / vfree().
- for (j = 0; j < bkvhead[i]->nr_records; j++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name,
- bkvhead[i]->records[j], 0);
-
- vfree(bkvhead[i]->records[j]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bkvhead[i]))
- bkvhead[i] = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bkvhead[i])
- free_page((unsigned long) bkvhead[i]);
-
- cond_resched_tasks_rcu_qs();
- }
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
}
/*
@@ -3102,21 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- for (; head; head = next) {
- unsigned long offset = (unsigned long)head->func;
- void *ptr = (void *)head - offset;
-
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
-
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
- }
+ kvfree_rcu_list(head);
}
static bool
@@ -3125,10 +3018,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
int i;
for (i = 0; i < FREE_N_CHANNELS; i++)
- if (krcp->bkvhead[i])
+ if (!list_empty(&krcp->bulk_head[i]))
return true;
- return !!krcp->head;
+ return !!READ_ONCE(krcp->head);
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
}
static void
@@ -3136,7 +3040,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
long delay, delay_left;
- delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
@@ -3146,6 +3050,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
queue_delayed_work(system_wq, &krcp->monitor_work, delay);
}
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3156,26 +3098,31 @@ static void kfree_rcu_monitor(struct work_struct *work)
unsigned long flags;
int i, j;
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
raw_spin_lock_irqsave(&krcp->lock, flags);
// Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bkvhead or head and attach it over any
+ // Try to detach bulk_head or head and attach it over any
// available corresponding free channel. It can be that
// a previous RCU batch is in progress, it means that
// immediately to queue another one is not possible so
// in that case the monitor work is rearmed.
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
- (krcp->head && !krwp->head_free)) {
+ if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
+ (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
+ (READ_ONCE(krcp->head) && !krwp->head_free)) {
+
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (!krwp->bkvhead_free[j]) {
- krwp->bkvhead_free[j] = krcp->bkvhead[j];
- krcp->bkvhead[j] = NULL;
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
}
}
@@ -3183,11 +3130,10 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
- krcp->head = NULL;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
}
- WRITE_ONCE(krcp->count, 0);
-
// One work is per one batch, so there are three
// "free channels", the batch can handle. It can
// be that the work is in the pending state when
@@ -3197,6 +3143,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
}
}
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
// If there is nothing to detach, it means that our job is
// successfully done here. In case of having at least one
// of the channels that is still busy we should rearm the
@@ -3204,8 +3152,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
// still in progress.
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static enum hrtimer_restart
@@ -3288,10 +3234,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return false;
idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!(*krcp)->bkvhead[idx] ||
- (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(*krcp);
if (!bnode && can_alloc) {
krc_this_cpu_unlock(*krcp, *flags);
@@ -3315,17 +3262,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = (*krcp)->bkvhead[idx];
-
- /* Attach it to the head. */
- (*krcp)->bkvhead[idx] = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
- /* Finally insert. */
- (*krcp)->bkvhead[idx]->records
- [(*krcp)->bkvhead[idx]->nr_records++] = ptr;
+ // Finally insert and update the GP for this page.
+ bnode->records[bnode->nr_records++] = ptr;
+ bnode->gp_snap = get_state_synchronize_rcu();
+ atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
}
@@ -3342,26 +3287,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool success;
- void *ptr;
- if (head) {
- ptr = (void *) head - (unsigned long) func;
- } else {
- /*
- * Please note there is a limitation for the head-less
- * variant, that is why there is a clear rule for such
- * objects: it can be used from might_sleep() context
- * only. For other places please embed an rcu_head to
- * your data.
- */
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ if (!head)
might_sleep();
- ptr = (unsigned long *) func;
- }
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
@@ -3382,14 +3322,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
- head->func = func;
+ head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
success = true;
}
- WRITE_ONCE(krcp->count, krcp->count + 1);
-
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_monitor_work(krcp);
@@ -3420,7 +3362,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count += READ_ONCE(krcp->count);
+ count += krc_count(krcp);
count += READ_ONCE(krcp->nr_bkv_objs);
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
@@ -3437,7 +3379,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count = krcp->count;
+ count = krc_count(krcp);
count += drain_page_cache(krcp);
kfree_rcu_monitor(&krcp->monitor_work.work);
@@ -3461,15 +3403,12 @@ static struct shrinker kfree_rcu_shrinker = {
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- raw_spin_lock_irqsave(&krcp->lock, flags);
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3485,9 +3424,10 @@ void __init kfree_rcu_scheduler_running(void)
*/
static int rcu_blocking_is_gp(void)
{
- if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
return false;
- might_sleep(); /* Check for RCU read-side critical section. */
+ }
return true;
}
@@ -3711,7 +3651,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
- * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
@@ -3722,6 +3664,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
* completed. Alternatively, they can use get_completed_synchronize_rcu()
* to get a guaranteed-completed grace-period state.
*
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate, and that returned at the end
@@ -4080,6 +4028,155 @@ retry:
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
* must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4408,11 +4505,13 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ rcu_async_hurry();
rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -4766,7 +4865,7 @@ struct workqueue_struct *rcu_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4786,8 +4885,14 @@ static void __init kfree_rcu_batch_init(void)
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
@@ -4838,6 +4943,8 @@ void __init rcu_init(void)
// Kick-start any polled grace periods that started early.
if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
(void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fcb5d696eb17..192536916f9a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -158,6 +158,23 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
+/*
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -262,6 +279,8 @@ struct rcu_data {
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index ed6c3cce28f2..249c2967d9e6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -11,6 +11,7 @@
static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -667,8 +668,11 @@ static void synchronize_rcu_expedited_wait(void)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(READ_ONCE(rnp->expmask) & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
+ rcu_exp_print_detail_task_stall_rnp(rnp);
}
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
panic_on_rcu_stall();
@@ -811,6 +815,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return ndetected;
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Request an expedited quiescent state. */
@@ -883,6 +917,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return 0;
}
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5653560573e2..b10b8349bb2a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -39,7 +39,7 @@ int rcu_exp_jiffies_till_stall_check(void)
// CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
// The minimum clamped value is "2UL", because at least one full
// tick has to be guaranteed.
- till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
@@ -428,6 +428,35 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp
return j > 2 * HZ;
}
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8ld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -484,6 +513,8 @@ static void print_cpu_stall_info(int cpu)
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
}
/* Complain about starvation of grace-period kthread. */
@@ -588,7 +619,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
@@ -649,7 +680,7 @@ static void print_cpu_stall(unsigned long gps)
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f5e6a2f95a2a..19bf6fa3ee6a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -144,8 +144,45 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
@@ -195,6 +232,7 @@ static bool rcu_boot_ended __read_mostly;
void rcu_end_inkernel_boot(void)
{
rcu_unexpedite_gp();
+ rcu_async_relax();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
rcu_boot_ended = true;
@@ -220,6 +258,7 @@ void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
+ pr_info("Running RCU synchronous self tests\n");
synchronize_rcu();
synchronize_rcu_expedited();
}
@@ -508,6 +547,10 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
@@ -555,9 +598,12 @@ struct early_boot_kfree_rcu {
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
call_rcu(&head, test_callback);
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
@@ -586,6 +632,7 @@ static int rcu_verify_early_boot_tests(void)
early_boot_test_counter++;
srcu_barrier(&early_srcu);
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
diff --git a/kernel/relay.c b/kernel/relay.c
index ef12532168d9..9aa70ae53d24 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
- vma->vm_flags |= VM_DONTEXPAND;
+ vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_private_data = buf;
return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index ddbbacb9fb50..b1763b2fd7ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1343,20 +1343,6 @@ retry:
continue;
}
- /*
- * All memory regions added from memory-hotplug path have the
- * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
- * this flag, we know that we are dealing with a resource coming
- * from HMM/devm. HMM/devm use another mechanism to add/release
- * a resource. This goes via devm_request_mem_region and
- * devm_release_mem_region.
- * HMM/devm take care to release their resources when they want,
- * so if we are dealing with them, let us just back off here.
- */
- if (!(res->flags & IORESOURCE_SYSRAM)) {
- break;
- }
-
if (!(res->flags & IORESOURCE_MEM))
break;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index d38ab944105d..9de6e35fe679 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
@@ -82,15 +85,25 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);
+ u32 mm_cid = task_mm_cid(t);
- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ WARN_ON_ONCE((int) mm_cid < 0);
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
+ unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -101,9 +114,10 @@ efault:
return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
+ mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
+ /*
+ * Reset mm_cid to its initial state (0).
+ */
+ if (put_user(mm_cid, &t->rseq->mm_cid))
+ return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
- if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e374c0c923da..5732fa75ebab 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -93,7 +93,7 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-notrace static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
@@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-notrace static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-notrace static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-notrace static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -287,13 +287,28 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock() + __sched_clock_offset;
+
+ preempt_disable_notrace();
+ clock = sched_clock_local(this_scd());
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb1ee6d7bdde..488655f2319f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -152,7 +152,7 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */
-static inline int __task_prio(struct task_struct *p)
+static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
@@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
@@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
return false;
}
-static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
@@ -2951,8 +2953,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
}
if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
@@ -3672,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ check_preempt_curr(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
@@ -3709,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_rq_held(rq);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
- else
-#endif
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
* Consider @p being inside a wait loop:
*
@@ -3767,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ check_preempt_curr(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -4135,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
trace_sched_waking(p);
- WRITE_ONCE(p->__state, TASK_RUNNING);
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
@@ -5101,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5331,6 +5342,11 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -6257,7 +6273,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
- for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
@@ -8290,12 +8306,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
- if (IS_ENABLED(CONFIG_SMP) && !user_mask) {
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
- cpumask_copy(user_mask, in_mask);
+
ac = (struct affinity_context){
.new_mask = in_mask,
.user_mask = user_mask,
@@ -8392,14 +8414,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
@@ -11356,3 +11378,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ t->mm_cid = mm_cid_get(mm);
+ t->mm_cid_active = 1;
+ local_irq_restore(flags);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..e3211455b203 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,7 +48,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
{
unsigned long boost;
@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
- u64 time, unsigned int flags)
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
return false;
sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time);
+ sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
unsigned int next_f;
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
/*
* Fall back to the "frequency" path if frequency invariance is not
@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
return;
}
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
/*
@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util), max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
+ sugov_iowait_apply(j_sg_cpu, time, max_cap);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
@@ -543,7 +546,7 @@ static void sugov_tunables_free(struct kobject *kobj)
kfree(to_sugov_tunables(attr_set));
}
-static struct kobj_type sugov_tunables_ktype = {
+static const struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
.release = &sugov_tunables_free,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,10 @@
* Simple CPU accounting cgroup controller
*/
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0d97d54276cc..71b24371a6f7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || task_current(rq, p)) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
}
DEFINE_SCHED_CLASS(dl) = {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54ae071..7a1b1f855b96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline bool entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
@@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
+ struct vma_iterator vmi;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- mas_set(&mas, start);
- vma = mas_find(&mas, ULONG_MAX);
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- mas_set(&mas, start);
- vma = mas_find(&mas, ULONG_MAX);
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
}
- for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
+ do {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
@@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
- }
+ } for_each_vma(vmi, vma);
out:
/*
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
*
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
- *
- * In case of capacity inversion we should honour the inverted capacity
- * for both uclamp_min and uclamp_max all the time.
*/
- capacity_orig = cpu_in_capacity_inversion(cpu);
- if (capacity_orig) {
- capacity_orig_thermal = capacity_orig;
- } else {
- capacity_orig = capacity_orig_of(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
- }
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
- fits = fits && (uclamp_min <= capacity_orig_thermal);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;
return fits;
}
@@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
unsigned long util = task_util_est(p);
- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4656,6 +4652,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
+ u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * Pull vruntime of the entity being placed to the base level of
+ * cfs_rq, to prevent boosting it if placed backwards. If the entity
+ * slept for a long time, don't even try to compare its vruntime with
+ * the base as it may be too far off and the comparison may get
+ * inversed due to s64 overflow.
+ */
+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ se->vruntime = vruntime;
+ else
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se;
s64 delta;
- ideal_runtime = sched_slice(cfs_rq, curr);
+ /*
+ * When many tasks blow up the sched_period; it is possible that
+ * sched_slice() reports unusually large results (when many tasks are
+ * very light for example). Therefore impose a maximum.
+ */
+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
@@ -5461,22 +5474,105 @@ unthrottle_throttle:
resched_curr(rq);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ struct cfs_rq *local_unthrottle = NULL;
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+ struct rq *rq;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle))
+ unthrottle_cfs_rq_async(cfs_rq);
+ else
+ local_unthrottle = cfs_rq;
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ if (cfs_rq_throttled(local_unthrottle))
+ unthrottle_cfs_rq(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ return throttled;
}
/*
@@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu)
unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ /* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
@@ -6801,6 +6939,7 @@ static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
@@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
@@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util,
int cpu)
{
if (sched_asym_cpucap_active())
- return util_fits_cpu(util, util_min, util_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
return true;
}
@@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7229,13 +7391,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv_task_busy_time(&eenv, p, prev_cpu);
for (; pd; pd = pd->next) {
+ unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
unsigned long rq_util_min, rq_util_max;
- unsigned long util_min, util_max;
unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
+ int fits, max_fits = -1;
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7251,6 +7414,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
eenv.pd_cap += cpu_thermal_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
@@ -7269,26 +7434,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- if (uclamp_is_used()) {
- if (uclamp_rq_is_idle(cpu_rq(cpu))) {
- util_min = p_util_min;
- util_max = p_util_max;
- } else {
- /*
- * Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
- * only. util_fits_cpu() logic requires to
- * operate on non clamped util but must use the
- * max-aggregated uclamp_{min, max}.
- */
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
-
- util_min = max(rq_util_min, p_util_min);
- util_max = max(rq_util_max, p_util_max);
- }
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
}
- if (!util_fits_cpu(util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;
lsub_positive(&cpu_cap, util);
@@ -7296,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
prev_spare_cap = cpu_cap;
- } else if (cpu_cap > max_spare_cap) {
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
* among the remaining CPUs in the performance
@@ -7304,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
}
@@ -7322,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
+ prev_thermal_cap = cpu_thermal_cap;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
+
+ /*
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
+ */
+ if ((max_fits < 0) &&
+ (cpu_thermal_cap <= best_thermal_cap))
+ continue;
+
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
goto unlock;
cur_delta -= base_energy;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_thermal_cap = cpu_thermal_cap;
}
}
rcu_read_unlock();
- if (best_delta < prev_delta)
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
target = best_energy_cpu;
return target;
@@ -8841,73 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- struct rq *rq = cpu_rq(cpu);
- rq->cpu_capacity_orig = capacity_orig;
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
- rq->cpu_capacity = capacity;
-
- /*
- * Detect if the performance domain is in capacity inversion state.
- *
- * Capacity inversion happens when another perf domain with equal or
- * lower capacity_orig_of() ends up having higher capacity than this
- * domain after subtracting thermal pressure.
- *
- * We only take into account thermal pressure in this detection as it's
- * the only metric that actually results in *real* reduction of
- * capacity due to performance points (OPPs) being dropped/become
- * unreachable due to thermal throttling.
- *
- * We assume:
- * * That all cpus in a perf domain have the same capacity_orig
- * (same uArch).
- * * Thermal pressure will impact all cpus in this perf domain
- * equally.
- */
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
- struct perf_domain *pd = rcu_dereference(rq->rd->pd);
-
- rq->cpu_capacity_inverted = 0;
-
- for (; pd; pd = pd->next) {
- struct cpumask *pd_span = perf_domain_span(pd);
- unsigned long pd_cap_orig, pd_cap;
-
- cpu = cpumask_any(pd_span);
- pd_cap_orig = arch_scale_cpu_capacity(cpu);
-
- if (capacity_orig < pd_cap_orig)
- continue;
-
- /*
- * handle the case of multiple perf domains have the
- * same capacity_orig but one of them is under higher
- * thermal pressure. We record it as capacity
- * inversion.
- */
- if (capacity_orig == pd_cap_orig) {
- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
-
- if (pd_cap > inv_cap) {
- rq->cpu_capacity_inverted = inv_cap;
- break;
- }
- } else if (pd_cap_orig > inv_cap) {
- rq->cpu_capacity_inverted = inv_cap;
- break;
- }
- }
- }
-
- trace_sched_cpu_capacity_tp(rq);
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
@@ -10135,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -10165,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -11728,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
/*
* se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
*/
-static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -11753,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
{
struct rq *rq = task_rq(a);
- struct sched_entity *sea = &a->se;
- struct sched_entity *seb = &b->se;
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
struct cfs_rq *cfs_rqa;
struct cfs_rq *cfs_rqb;
s64 delta;
@@ -12474,6 +12608,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
}
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..e9ef66be2870 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ instrumentation_begin();
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
- ct_idle_enter();
- local_irq_enable();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
- ct_idle_exit();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- raw_local_irq_enable();
}
/**
@@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
-
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
- /*
- * arch_cpu_idle() is supposed to enable IRQs, however
- * we can't do that because of RCU and tracing.
- *
- * Trace IRQs enable here, then switch off RCU, and have
- * arch_cpu_idle() use raw_local_irq_enable(). Note that
- * ct_idle_enter() relies on lockdep IRQ state, so switch that
- * last -- this is very similar to the entry code.
- */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- ct_idle_enter();
- lockdep_hardirqs_on(_THIS_IP_);
-
+ ct_cpuidle_enter();
arch_cpu_idle();
-
- /*
- * OK, so IRQs are enabled here, but RCU needs them disabled to
- * turn itself back on.. funny thing is that disabling IRQs
- * will cause tracing, which needs RCU. Jump through hoops to
- * make it 'work'.
- */
- raw_local_irq_disable();
- lockdep_hardirqs_off(_THIS_IP_);
- ct_idle_exit();
- lockdep_hardirqs_on(_THIS_IP_);
- raw_local_irq_enable();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 0c5be7ebb1dc..2ad881d07752 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -159,7 +159,8 @@
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
- | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
static void ipi_mb(void *info)
{
@@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags)
return 0;
}
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
/**
* sys_membarrier - issue memory barriers on a set of threads
* @cmd: Takes command values defined in enum membarrier_cmd.
@@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
default:
return -EINVAL;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8ac8b81bfee6..02e011cabe91 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)
group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
+ wake_up_pollfree(&t->event_wait);
mutex_lock(&group->trigger_lock);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ed2a47e4ddae..0a11f44adee5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
do {
rt_se = pick_next_rt_entity(rt_rq);
- BUG_ON(!rt_se);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..3e8df6d31c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample)
#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SMP
+ struct list_head throttled_csd_list;
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1041,7 +1044,6 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
- unsigned long cpu_capacity_inverted;
struct balance_callback *balance_callback;
@@ -1154,6 +1156,11 @@ struct rq {
/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
}
/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
return se->cfs_rq;
}
@@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#else
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
+#define task_of(_se) container_of(_se, struct task_struct, se)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
{
return &task_rq(p)->cfs;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
+ const struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
@@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)
return cpu_rq(cpu)->cpu_capacity_orig;
}
-/*
- * Returns inverted capacity if the CPU is in capacity inversion state.
- * 0 otherwise.
- *
- * Capacity inversion detection only considers thermal impact where actual
- * performance points (OPPs) gets dropped.
- *
- * Capacity inversion state happens when another performance domain that has
- * equal or lower capacity_orig_of() becomes effectively larger than the perf
- * domain this CPU belongs to due to thermal pressure throttling it hard.
- *
- * See comment in update_cpu_capacity().
- */
-static inline unsigned long cpu_in_capacity_inversion(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_inverted;
-}
-
/**
* enum cpu_util_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
cgroup_account_cputime(curr, delta_exec);
}
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ cid = cpumask_first_zero(cpumask);
+ if (cid >= nr_cpu_ids)
+ return -1;
+ __cpumask_set_cpu(cid, cpumask);
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+ lockdep_assert_irqs_disabled();
+ if (cid < 0)
+ return;
+ raw_spin_lock(&mm->cid_lock);
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+ int ret;
+
+ lockdep_assert_irqs_disabled();
+ raw_spin_lock(&mm->cid_lock);
+ ret = __mm_cid_get(mm);
+ raw_spin_unlock(&mm->cid_lock);
+ return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+ if (prev->mm_cid_active) {
+ if (next->mm_cid_active && next->mm == prev->mm) {
+ /*
+ * Context switch between threads in same mm, hand over
+ * the mm_cid from prev to next.
+ */
+ next->mm_cid = prev->mm_cid;
+ prev->mm_cid = -1;
+ return;
+ }
+ mm_cid_put(prev->mm, prev->mm_cid);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..051aaf65c749 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,8 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/bsearch.h>
+
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
@@ -578,7 +580,7 @@ out:
*/
struct root_domain def_root_domain;
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -2067,6 +2069,99 @@ unlock:
return found;
}
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ if (b == k->masks) {
+ k->w = 0;
+ return 0;
+ }
+
+ prev_hop = *((struct cpumask ***)b - 1);
+ k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: Nth cpu to find
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ rcu_read_lock();
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2451,7 +2546,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* Set up scheduler domains and groups. For now this just excludes isolated
* CPUs, but could be used to exclude other special cases in the future.
*/
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e9852d1b4a5e..cebf26445f9e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -388,6 +388,7 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte
}
#endif /* SECCOMP_ARCH_NATIVE */
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -397,7 +398,6 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte
*
* Returns valid seccomp BPF response codes.
*/
-#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9..8cb28f1df294 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
diff --git a/kernel/sys.c b/kernel/sys.c
index 88b31f096fb2..495cd87d9bf4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2350,6 +2350,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */
+static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
+ set_bit(MMF_HAS_MDWE, &current->mm->flags);
+ else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ return -EPERM; /* Cannot unset the flag */
+
+ return 0;
+}
+
+static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
+ PR_MDWE_REFUSE_EXEC_GAIN : 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2625,6 +2652,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_MDWE:
+ error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_MDWE:
+ error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
+ break;
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137d4abe3eda..1c240d2c99bc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -425,21 +425,6 @@ static void proc_put_char(void **buf, size_t *size, char c)
}
}
-static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- *(bool *)valp = *lvalp;
- } else {
- int val = *(bool *)valp;
-
- *lvalp = (unsigned long)val;
- *negp = false;
- }
- return 0;
-}
-
static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
@@ -710,16 +695,36 @@ int do_proc_douintvec(struct ctl_table *table, int write,
* @lenp: the size of the user buffer
* @ppos: file position
*
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
+ * Reads/writes one integer value from/to the user buffer,
+ * treated as an ASCII string.
+ *
+ * table->data must point to a bool variable and table->maxlen must
+ * be sizeof(bool).
*
* Returns 0 on success.
*/
int proc_dobool(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dobool_conv, NULL);
+ struct ctl_table tmp;
+ bool *data = table->data;
+ int res, val;
+
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(bool))
+ return -EINVAL;
+
+ tmp = *table;
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+
+ val = READ_ONCE(*data);
+ res = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (res)
+ return res;
+ if (write)
+ WRITE_ONCE(*data, val);
+ return 0;
}
/**
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a41753be1a2b..bae8f11070be 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
- default 100
+ default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
endmenu
endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5897828b9d7e..7e5dff602585 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->get_ktime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
+
+ return alarm_forward(alarm, now, interval);
+}
- return alarm_forward(alarm, base->get_ktime(), interval);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals proper. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9cf32ccda715..91836b727cef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -96,6 +96,11 @@ static int finished_booting;
static u64 suspend_start;
/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+
+/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
*/
@@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
-#define MAX_SKEW_USEC 100
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}
- pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
- smp_processor_id(), watchdog->name, wd_delay, nretries);
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affect the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
+ u64 cs_wd_msec;
+ u64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
+ wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e8c08292defc 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
u64 slack;
slack = current->timer_slack_ns;
- if (dl_task(current) || rt_task(current))
+ if (rt_task(current))
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
@@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2270,7 +2272,7 @@ void __init hrtimers_init(void)
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
* @clock_id: timer clock to be used
*/
@@ -2297,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
+ /*
+ * Override any slack passed by the user if under
+ * rt contraints.
+ */
+ if (rt_task(current))
+ delta = 0;
+
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
@@ -2316,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
@@ -2324,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
* the current task state has been set (see set_current_state()).
*
* The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
+ * actual wakeup to a time that is both power and performance friendly
+ * for regular (non RT/DL) tasks.
* The kernel give the normal best effort behavior for "@expires+@delta",
* but may decide to fire the timer earlier, but no earlier than @expires.
*
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb925e8ef9a8..2f5e9b34022c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- u64 curr_cputime;
-retry:
- curr_cputime = atomic64_read(cputime);
- if (sum_cputime > curr_cputime) {
- if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
- goto retry;
- }
+ u64 curr_cputime = atomic64_read(cputime);
+
+ do {
+ if (sum_cputime <= curr_cputime)
+ return;
+ } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 90ea5f373e50..828aeecbd1e8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5dead89308b7..0c8a87a11b39 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 13b11eb62685..20d5df631570 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -149,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
+ debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 797eb93103ad..e28f9210f8a1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
+ */
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
*
- * As this can be called from idle code, the hrtimer_start()
- * invocation has to be wrapped with RCU_NONIDLE() as
- * hrtimer_start() can call into tracing.
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
*/
- RCU_NONIDLE( {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
- /*
- * The core tick broadcast mode expects bc->bound_on to be set
- * correctly to prevent a CPU which has the broadcast hrtimer
- * armed from going deep idle.
- *
- * As tick_broadcast_lock is held, nothing can change the cpu
- * base which was just established in hrtimer_start() above. So
- * the below access is safe even without holding the hrtimer
- * base lock.
- */
- bc->bound_on = bctimer.base->cpu_base->cpu;
- } );
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f7fe6fe36173..93bf2b4e47e5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
* to avoid a deep idle transition as we are about to get the
* broadcast IPI right away.
*/
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
{
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+ return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
}
/*
diff --git a/kernel/torture.c b/kernel/torture.c
index 789aeb0e1159..1a0519b836ac 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -450,7 +450,7 @@ unsigned long
torture_random(struct torture_random_state *trsp)
{
if (--trsp->trs_count < 0) {
- trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_state += (unsigned long)local_clock() + raw_smp_processor_id();
trsp->trs_count = TORTURE_RANDOM_REFRESH;
}
trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
@@ -915,7 +915,7 @@ void torture_kthread_stopping(char *title)
VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
}
EXPORT_SYMBOL_GPL(torture_kthread_stopping);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5f5e64f9e715..a856d4a34c67 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -42,6 +42,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+ bool
+
config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
@@ -257,6 +260,10 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_CALL_OPS
+ def_bool y
+ depends on HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+
config DYNAMIC_FTRACE_WITH_ARGS
def_bool y
depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 918a7d12df8f..d5d94510afd3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
* under 'q->debugfs_dir', thus lookup and remove them.
*/
if (!bt->dir) {
- debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
- debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ debugfs_lookup_and_remove("dropped", q->debugfs_dir);
+ debugfs_lookup_and_remove("msg", q->debugfs_dir);
} else {
debugfs_remove(bt->dir);
}
@@ -729,14 +729,10 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
**/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
- struct request_queue *q;
+ struct request_queue *q = bdev_get_queue(bdev);
int ret, start = 0;
char b[BDEVNAME_SIZE];
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
mutex_lock(&q->debugfs_mutex);
switch (cmd) {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f47274de012b..e8da032bb6fc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -369,8 +369,6 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
-static DEFINE_RAW_SPINLOCK(trace_printk_lock);
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -378,23 +376,22 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
- u32 *bin_args;
- static char buf[BPF_TRACE_PRINTK_SIZE];
- unsigned long flags;
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
int ret;
- ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
- MAX_TRACE_PRINTK_VARARGS);
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args,
+ MAX_TRACE_PRINTK_VARARGS, &data);
if (ret < 0)
return ret;
- raw_spin_lock_irqsave(&trace_printk_lock, flags);
- ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- trace_bpf_trace_printk(buf);
- raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+ trace_bpf_trace_printk(data.buf);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return ret;
}
@@ -427,30 +424,29 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
+BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args,
u32, data_len)
{
- static char buf[BPF_TRACE_PRINTK_SIZE];
- unsigned long flags;
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
int ret, num_args;
- u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
- ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
if (ret < 0)
return ret;
- raw_spin_lock_irqsave(&trace_printk_lock, flags);
- ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- trace_bpf_trace_printk(buf);
- raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+ trace_bpf_trace_printk(data.buf);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return ret;
}
@@ -472,23 +468,25 @@ const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
}
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
- err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
if (err < 0)
return err;
- seq_bprintf(m, fmt, bin_args);
+ seq_bprintf(m, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
@@ -687,8 +685,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
@@ -746,8 +743,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
@@ -833,6 +829,7 @@ static void do_bpf_send_signal(struct irq_work *entry)
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
@@ -867,7 +864,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = current;
+ work->task = get_task_struct(current);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
@@ -1238,7 +1235,7 @@ __diag_ignore_all("-Wmissing-prototypes",
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
* NULL pointer otherwise.
*/
-struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
{
key_ref_t key_ref;
struct bpf_key *bkey;
@@ -1287,7 +1284,7 @@ struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
* Return: a bpf_key pointer with an invalid key pointer set from the
* pre-determined ID on success, a NULL pointer otherwise
*/
-struct bpf_key *bpf_lookup_system_key(u64 id)
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
{
struct bpf_key *bkey;
@@ -1311,7 +1308,7 @@ struct bpf_key *bpf_lookup_system_key(u64 id)
* Decrement the reference count of the key inside *bkey*, if the pointer
* is valid, and free *bkey*.
*/
-void bpf_key_put(struct bpf_key *bkey)
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
{
if (bkey->has_ref)
key_put(bkey->key);
@@ -1331,7 +1328,7 @@ void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
struct bpf_dynptr_kern *sig_ptr,
struct bpf_key *trusted_keyring)
{
@@ -2687,69 +2684,77 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv)
}
}
-struct module_addr_args {
- unsigned long *addrs;
- u32 addrs_cnt;
+struct modules_array {
struct module **mods;
int mods_cnt;
int mods_cap;
};
-static int module_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int add_module(struct modules_array *arr, struct module *mod)
{
- struct module_addr_args *args = data;
struct module **mods;
- /* We iterate all modules symbols and for each we:
- * - search for it in provided addresses array
- * - if found we check if we already have the module pointer stored
- * (we iterate modules sequentially, so we can check just the last
- * module pointer)
- * - take module reference and store it
- */
- if (!bsearch(&addr, args->addrs, args->addrs_cnt, sizeof(addr),
- bpf_kprobe_multi_addrs_cmp))
- return 0;
-
- if (args->mods && args->mods[args->mods_cnt - 1] == mod)
- return 0;
-
- if (args->mods_cnt == args->mods_cap) {
- args->mods_cap = max(16, args->mods_cap * 3 / 2);
- mods = krealloc_array(args->mods, args->mods_cap, sizeof(*mods), GFP_KERNEL);
+ if (arr->mods_cnt == arr->mods_cap) {
+ arr->mods_cap = max(16, arr->mods_cap * 3 / 2);
+ mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);
if (!mods)
return -ENOMEM;
- args->mods = mods;
+ arr->mods = mods;
}
- if (!try_module_get(mod))
- return -EINVAL;
-
- args->mods[args->mods_cnt] = mod;
- args->mods_cnt++;
+ arr->mods[arr->mods_cnt] = mod;
+ arr->mods_cnt++;
return 0;
}
+static bool has_module(struct modules_array *arr, struct module *mod)
+{
+ int i;
+
+ for (i = arr->mods_cnt - 1; i >= 0; i--) {
+ if (arr->mods[i] == mod)
+ return true;
+ }
+ return false;
+}
+
static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
{
- struct module_addr_args args = {
- .addrs = addrs,
- .addrs_cnt = addrs_cnt,
- };
- int err;
+ struct modules_array arr = {};
+ u32 i, err = 0;
+
+ for (i = 0; i < addrs_cnt; i++) {
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addrs[i]);
+ /* Either no module or we it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ preempt_enable();
+ continue;
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
+ preempt_enable();
+ if (err)
+ break;
+ err = add_module(&arr, mod);
+ if (err) {
+ module_put(mod);
+ break;
+ }
+ }
/* We return either err < 0 in case of error, ... */
- err = module_kallsyms_on_each_symbol(module_callback, &args);
if (err) {
- kprobe_multi_put_modules(args.mods, args.mods_cnt);
- kfree(args.mods);
+ kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
+ kfree(arr.mods);
return err;
}
/* or number of modules found if everything is ok. */
- *mods = args.mods;
- return args.mods_cnt;
+ *mods = arr.mods;
+ return arr.mods_cnt;
}
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -2862,13 +2867,6 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
bpf_kprobe_multi_cookie_cmp,
bpf_kprobe_multi_cookie_swap,
link);
- } else {
- /*
- * We need to sort addrs array even if there are no cookies
- * provided, to allow bsearch in get_modules_for_addrs.
- */
- sort(addrs, cnt, sizeof(*addrs),
- bpf_kprobe_multi_addrs_cmp, NULL);
}
err = get_modules_for_addrs(&link->mods, addrs, cnt);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ec2897a76004..0feea145bb29 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -125,6 +125,33 @@ struct ftrace_ops global_ops;
void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+/*
+ * Stub used to invoke the list ops without requiring a separate trampoline.
+ */
+const struct ftrace_ops ftrace_list_ops = {
+ .func = ftrace_ops_list_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
+
+static void ftrace_ops_nop_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ /* do nothing */
+}
+
+/*
+ * Stub used when a call site is disabled. May be called transiently by threads
+ * which have made it into ftrace_caller but haven't yet recovered the ops at
+ * the point the call site is disabled.
+ */
+const struct ftrace_ops ftrace_nop_ops = {
+ .func = ftrace_ops_nop_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
+#endif
+
static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1820,6 +1847,18 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* if rec count is zero.
*/
}
+
+ /*
+ * If the rec has a single associated ops, and ops->func can be
+ * called directly, allow the call site to call via the ops.
+ */
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) &&
+ ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_get_func(ops) == ops->func)
+ rec->flags |= FTRACE_FL_CALL_OPS;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS;
+
count++;
/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
@@ -2114,8 +2153,9 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
struct ftrace_ops *ops = NULL;
pr_info("ftrace record flags: %lx\n", rec->flags);
- pr_cont(" (%ld)%s", ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ pr_cont(" (%ld)%s%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -2183,6 +2223,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* want the direct enabled (it will be done via the
* direct helper). But if DIRECT_EN is set, and
* the count is not one, we need to clear it.
+ *
*/
if (ftrace_rec_count(rec) == 1) {
if (!(rec->flags & FTRACE_FL_DIRECT) !=
@@ -2191,6 +2232,19 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
} else if (rec->flags & FTRACE_FL_DIRECT_EN) {
flag |= FTRACE_FL_DIRECT;
}
+
+ /*
+ * Ops calls are special, as count matters.
+ * As with direct calls, they must only be enabled when count
+ * is one, otherwise they'll be handled via the list ops.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_CALL_OPS) !=
+ !(rec->flags & FTRACE_FL_CALL_OPS_EN))
+ flag |= FTRACE_FL_CALL_OPS;
+ } else if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ flag |= FTRACE_FL_CALL_OPS;
+ }
}
/* If the state of this record hasn't changed, then do nothing */
@@ -2235,6 +2289,21 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
rec->flags &= ~FTRACE_FL_DIRECT_EN;
}
}
+
+ if (flag & FTRACE_FL_CALL_OPS) {
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_CALL_OPS)
+ rec->flags |= FTRACE_FL_CALL_OPS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ } else {
+ /*
+ * Can only call directly if there's
+ * only one set of associated ops.
+ */
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ }
+ }
}
/*
@@ -2264,7 +2333,8 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN |
+ FTRACE_FL_CALL_OPS_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -2437,6 +2507,25 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}
+struct ftrace_ops *
+ftrace_find_unique_ops(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op, *found = NULL;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (hash_contains_ip(ip, op->func_hash)) {
+ if (found)
+ return NULL;
+ found = op;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return found;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash *direct_functions = EMPTY_HASH;
@@ -3786,11 +3875,12 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
- seq_printf(m, " (%ld)%s%s%s",
+ seq_printf(m, " (%ld)%s%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
- rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
+ rec->flags & FTRACE_FL_DIRECT ? " D" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -3806,6 +3896,15 @@ static int t_show(struct seq_file *m, void *v)
} else {
add_trampoline_func(m, NULL, rec);
}
+ if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ ops = ftrace_find_unique_ops(rec);
+ if (ops) {
+ seq_printf(m, "\tops: %pS (%pS)",
+ ops, ops->func);
+ } else {
+ seq_puts(m, "\tops: ERROR!");
+ }
+ }
if (rec->flags & FTRACE_FL_DIRECT) {
unsigned long direct;
@@ -8346,7 +8445,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
found_all = kallsyms_on_each_symbol(kallsyms_callback, &args);
if (found_all)
return 0;
- found_all = module_kallsyms_on_each_symbol(kallsyms_callback, &args);
+ found_all = module_kallsyms_on_each_symbol(NULL, kallsyms_callback, &args);
return found_all ? 0 : -ESRCH;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3c7cd135333f..c6f47b6cfd5f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1577,19 +1577,6 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * rb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
-{
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
- return 1;
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
- return 1;
- return 0;
-}
-
-/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
@@ -1598,36 +1585,27 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = cpu_buffer->pages;
- struct buffer_page *bpage, *tmp;
+ struct list_head *head = rb_list_head(cpu_buffer->pages);
+ struct list_head *tmp;
- /* Reset the head page if it exists */
- if (cpu_buffer->head_page)
- rb_set_head_page(cpu_buffer);
-
- rb_head_page_deactivate(cpu_buffer);
-
- if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
- return -1;
- if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->next)->prev) != head))
return -1;
- if (rb_check_list(cpu_buffer, head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->prev)->next) != head))
return -1;
- list_for_each_entry_safe(bpage, tmp, head, list) {
+ for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
- bpage->list.next->prev != &bpage->list))
+ rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
return -1;
+
if (RB_WARN_ON(cpu_buffer,
- bpage->list.prev->next != &bpage->list))
- return -1;
- if (rb_check_list(cpu_buffer, &bpage->list))
+ rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
return -1;
}
- rb_head_page_activate(cpu_buffer);
-
return 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4e9a7a952025..937e9676dfd4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3173,6 +3173,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
}
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
+ return;
+
/*
* When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
@@ -5645,7 +5648,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
- "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n"
+ "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"
"\t -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -5662,7 +5665,7 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
- "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t symstr, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
@@ -9195,9 +9198,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
if (val > 100)
return -EINVAL;
- if (!val)
- val = 1;
-
tr->buffer_percent = val;
(*ppos)++;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f10bf804dd2b..616e1aa1c4da 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -25,7 +25,7 @@
#include "pid_list.h"
#ifdef CONFIG_FTRACE_SYSCALLS
-#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/unistd.h> /* For NR_syscalls */
#include <asm/syscall.h> /* some archs define it here */
#endif
@@ -1286,6 +1286,7 @@ struct ftrace_event_field {
int offset;
int size;
int is_signed;
+ int len;
};
struct prog_entry;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 352b65e2b910..67e854979d53 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -311,7 +311,7 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -320,7 +320,8 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
return trace_handle_return(s);
}
-static unsigned long get_event_field(struct fetch_insn *code, void *rec)
+static nokprobe_inline unsigned long
+get_event_field(struct fetch_insn *code, void *rec)
{
struct ftrace_event_field *field = code->data;
unsigned long val;
@@ -395,20 +396,12 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)
case FETCH_OP_TP_ARG:
val = get_event_field(code, rec);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
goto retry;
default:
- continue;
+ if (process_common_fetch_insn(code, &val) < 0)
+ continue;
}
code++;
len = process_fetch_insn_bottom(code, val, NULL, NULL);
@@ -428,84 +421,26 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
unsigned long val;
+ int ret;
retry:
switch (code->op) {
case FETCH_OP_TP_ARG:
val = get_event_field(code, rec);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
goto retry;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
return process_fetch_insn_bottom(code, val, dest, base);
}
NOKPROBE_SYMBOL(process_fetch_insn)
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- return kern_fetch_store_strlen_user(addr);
-}
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen(unsigned long addr)
-{
- return kern_fetch_store_strlen(addr);
-}
-
-/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string_user(addr, dest, base);
-}
-
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string(addr, dest, base);
-}
-
-static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
-{
- const void __user *uaddr = (__force const void __user *)src;
-
- return copy_from_user_nofault(dest, uaddr, size);
-}
-
-static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)src < TASK_SIZE)
- return probe_mem_read_user(dest, src, size);
-#endif
- return copy_from_kernel_nofault(dest, src, size);
-}
-
/* eprobe handler */
static inline void
__eprobe_trace_func(struct eprobe_data *edata, void *rec)
@@ -923,17 +858,13 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch
p = ep->filter_str;
for (i = 0; i < argc; i++) {
- ret = snprintf(p, len, "%s ", argv[i]);
- if (ret < 0)
- goto error;
- if (ret > len) {
- ret = -E2BIG;
- goto error;
- }
+ if (i)
+ ret = snprintf(p, len, " %s", argv[i]);
+ else
+ ret = snprintf(p, len, "%s", argv[i]);
p += ret;
len -= ret;
}
- p[-1] = '\0';
/*
* Ensure the filter string can be parsed correctly. Note, this
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b52c4d79eb78..654ffa40457a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type)
+ int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
@@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->len = len;
list_add(&field->link, head);
@@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type);
+ is_signed, filter_type, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
+static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type, int len)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type, len);
+}
+
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type); \
+ filter_type, 0); \
if (ret) \
return ret;
@@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), FILTER_OTHER, 0); \
if (ret) \
return ret;
@@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v)
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
- else
- seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ else if (field->len)
+ seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
- array_descriptor, field->offset,
+ field->len, field->offset,
field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ field->offset, field->size, !!field->is_signed);
return 0;
}
@@ -2377,9 +2397,10 @@ event_define_fields(struct trace_event_call *call)
}
offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
+ ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
- field->is_signed, field->filter_type);
+ field->is_signed, field->filter_type,
+ field->len);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 70bddb25d9c0..46d0abb32d0f 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -448,12 +448,12 @@ static unsigned int trace_string(struct synth_trace_event *entry,
data_offset = struct_size(entry, fields, event->n_u64);
data_offset += data_size;
- len = kern_fetch_store_strlen((unsigned long)str_val);
+ len = fetch_store_strlen((unsigned long)str_val);
data_offset |= len << 16;
*(u32 *)&entry->fields[*n_u64] = data_offset;
- ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
+ ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
@@ -542,7 +542,7 @@ static notrace void trace_event_raw_event_synth(void *__data,
len = *((unsigned long *)str_val);
len *= sizeof(unsigned long);
} else {
- len = kern_fetch_store_strlen((unsigned long)str_val);
+ len = fetch_store_strlen((unsigned long)str_val);
}
fields_size += len;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d960f6b11b5e..58f3946081e2 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
- is_signed_type(_type), .filter_type = FILTER_OTHER },
+ is_signed_type(_type), .filter_type = FILTER_OTHER, \
+ .len = _len },
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ee77c8203bd5..59cda19a9033 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1218,60 +1218,6 @@ static const struct file_operations kprobe_profile_ops = {
.release = seq_release,
};
-/* Kprobe specific fetch functions */
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- return kern_fetch_store_strlen_user(addr);
-}
-
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen(unsigned long addr)
-{
- return kern_fetch_store_strlen(addr);
-}
-
-/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string_user(addr, dest, base);
-}
-
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
- */
-static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
-{
- return kern_fetch_store_string(addr, dest, base);
-}
-
-static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
-{
- const void __user *uaddr = (__force const void __user *)src;
-
- return copy_from_user_nofault(dest, uaddr, size);
-}
-
-static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)src < TASK_SIZE)
- return probe_mem_read_user(dest, src, size);
-#endif
- return copy_from_kernel_nofault(dest, src, size);
-}
-
/* Note that we don't verify it, since the code does not come from user space */
static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
@@ -1279,6 +1225,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
retry:
/* 1st stage: get value from context */
@@ -1295,15 +1242,6 @@ retry:
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
- case FETCH_OP_COMM:
- val = (unsigned long)current->comm;
- break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
@@ -1313,7 +1251,9 @@ retry:
code++;
goto retry;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -1424,7 +1364,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
@@ -1459,7 +1399,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
trace_seq_putc(s, ')');
- if (print_probe_args(s, tp->args, tp->nr_args,
+ if (trace_probe_print_args(s, tp->args, tp->nr_args,
(u8 *)&field[1], field) < 0)
goto out;
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 1e130da1b742..e37446f7916e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,6 +15,20 @@
#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point) trace_##point
+#else
+#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
@@ -78,56 +89,24 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
- this_cpu_write(tracing_irq_cpu, 0);
- }
-
- lockdep_hardirqs_on_prepare();
- lockdep_hardirqs_on(caller_addr);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
- lockdep_hardirqs_off(caller_addr);
-
- if (!this_cpu_read(tracing_irq_cpu)) {
- this_cpu_write(tracing_irq_cpu, 1);
- tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
- }
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_enable_rcuidle(a0, a1);
+ trace(preempt_enable)(a0, a1);
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_disable_rcuidle(a0, a1);
+ trace(preempt_disable)(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 01ebabbbe8c9..20d0c4a97633 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")
int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
{
@@ -95,6 +96,7 @@ static const struct fetch_type probe_fetch_types[] = {
ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0),
ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
ASSIGN_FETCH_TYPE_END
@@ -1237,3 +1239,30 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
return ret;
}
+
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field)
+{
+ void *p;
+ int i, j;
+
+ for (i = 0; i < nr_args; i++) {
+ struct probe_arg *a = args + i;
+
+ trace_seq_printf(s, " %s=", a->name);
+ if (likely(!a->count)) {
+ if (!a->type->print(s, data + a->offset, field))
+ return -ENOMEM;
+ continue;
+ }
+ trace_seq_putc(s, '{');
+ p = data + a->offset;
+ for (j = 0; j < a->count; j++) {
+ if (!a->type->print(s, p, field))
+ return -ENOMEM;
+ trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
+ p += a->type->size;
+ }
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 23acfd1c3812..ef8ed3b65d05 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -166,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(char);
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
@@ -348,6 +349,8 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);
bool trace_probe_match_command_args(struct trace_probe *tp,
int argc, const char **argv);
int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
+int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
+ u8 *data, void *field);
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h
index 77dbd9ff9782..c4e1d4c03a85 100644
--- a/kernel/trace/trace_probe_kernel.h
+++ b/kernel/trace/trace_probe_kernel.h
@@ -12,7 +12,7 @@
*/
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
-kern_fetch_store_strlen_user(unsigned long addr)
+fetch_store_strlen_user(unsigned long addr)
{
const void __user *uaddr = (__force const void __user *)addr;
int ret;
@@ -29,14 +29,14 @@ kern_fetch_store_strlen_user(unsigned long addr)
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
-kern_fetch_store_strlen(unsigned long addr)
+fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if (addr < TASK_SIZE)
- return kern_fetch_store_strlen_user(addr);
+ return fetch_store_strlen_user(addr);
#endif
do {
@@ -63,7 +63,7 @@ static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void
* with max length and relative data location.
*/
static nokprobe_inline int
-kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
@@ -86,7 +86,7 @@ kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)
* length and relative data location.
*/
static nokprobe_inline int
-kern_fetch_store_string(unsigned long addr, void *dest, void *base)
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
@@ -94,7 +94,7 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)addr < TASK_SIZE)
- return kern_fetch_store_string_user(addr, dest, base);
+ return fetch_store_string_user(addr, dest, base);
#endif
if (unlikely(!maxlen))
@@ -112,4 +112,22 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)
return ret;
}
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
#endif /* __TRACE_PROBE_KERNEL_H_ */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 5cea672243f6..00707630788d 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -98,6 +98,26 @@ fetch_store_symstring(unsigned long addr, void *dest, void *base)
return sprint_symbol(__dest, addr);
}
+/* common part of process_fetch_insn*/
+static nokprobe_inline int
+process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
+{
+ switch (code->op) {
+ case FETCH_OP_IMM:
+ *val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ *val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ *val = (unsigned long)code->data;
+ break;
+ default:
+ return -EILSEQ;
+ }
+ return 0;
+}
+
/* From the 2nd stage, routine is same */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
@@ -253,31 +273,3 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
}
}
}
-
-static inline int
-print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
- u8 *data, void *field)
-{
- void *p;
- int i, j;
-
- for (i = 0; i < nr_args; i++) {
- struct probe_arg *a = args + i;
-
- trace_seq_printf(s, " %s=", a->name);
- if (likely(!a->count)) {
- if (!a->type->print(s, data + a->offset, field))
- return -ENOMEM;
- continue;
- }
- trace_seq_putc(s, '{');
- p = data + a->offset;
- for (j = 0; j < a->count; j++) {
- if (!a->type->print(s, p, field))
- return -ENOMEM;
- trace_seq_putc(s, j == a->count - 1 ? '}' : ',');
- p += a->type->size;
- }
- }
- return 0;
-}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8d64b6553aed..8b92e34ff0c8 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -220,6 +220,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
struct pt_regs *regs = rec;
unsigned long val;
+ int ret;
/* 1st stage: get value from context */
switch (code->op) {
@@ -235,20 +236,16 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
case FETCH_OP_RETVAL:
val = regs_return_value(regs);
break;
- case FETCH_OP_IMM:
- val = code->immediate;
- break;
case FETCH_OP_COMM:
val = FETCH_TOKEN_COMM;
break;
- case FETCH_OP_DATA:
- val = (unsigned long)code->data;
- break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
default:
- return -EILSEQ;
+ ret = process_common_fetch_insn(code, &val);
+ if (ret < 0)
+ return ret;
}
code++;
@@ -1042,7 +1039,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
data = DATAOF_TRACE_ENTRY(entry, false);
}
- if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
+ if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)
goto out;
trace_seq_putc(s, '\n');
diff --git a/kernel/umh.c b/kernel/umh.c
index 850631518665..60aa9e764a38 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -32,9 +32,6 @@
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
@@ -438,21 +435,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE)
- state |= TASK_KILLABLE;
-
if (wait & UMH_FREEZABLE)
state |= TASK_FREEZABLE;
- retval = wait_for_completion_state(&done, state);
- if (!retval)
- goto wait_done;
-
if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
+ if (!retval)
+ goto wait_done;
+
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
wait_done:
retval = sub_info->retval;
@@ -495,9 +498,9 @@ static int proc_cap_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
+ unsigned long cap_array[2];
+ kernel_cap_t new_cap, *cap;
+ int err;
if (write && (!capable(CAP_SETPCAP) ||
!capable(CAP_SYS_MODULE)))
@@ -506,16 +509,13 @@ static int proc_cap_handler(struct ctl_table *table, int write,
/*
* convert from the global kernel_cap_t to the ulong array to print to
* userspace if this is a read.
+ *
+ * Legacy format: capabilities are exposed as two 32-bit values
*/
+ cap = table->data;
spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
+ cap_array[0] = (u32) cap->val;
+ cap_array[1] = cap->val >> 32;
spin_unlock(&umh_sysctl_lock);
t = *table;
@@ -529,22 +529,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,
if (err < 0)
return err;
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
+ new_cap.val = (u32)cap_array[0];
+ new_cap.val += (u64)cap_array[1] << 32;
/*
* Drop everything not in the new_cap (but don't add things)
*/
if (write) {
spin_lock(&umh_sysctl_lock);
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ *cap = cap_intersect(*cap, new_cap);
spin_unlock(&umh_sysctl_lock);
}
@@ -554,15 +547,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .data = &usermodehelper_bset,
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{
.procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .data = &usermodehelper_inheritable,
+ .maxlen = 2 * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 54211dbd516c..1d8e47bed3f1 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -229,7 +229,7 @@ void __put_user_ns(struct user_namespace *ns)
EXPORT_SYMBOL(__put_user_ns);
/**
- * idmap_key struct holds the information necessary to find an idmapping in a
+ * struct idmap_key - holds the information necessary to find an idmapping in a
* sorted idmap array. It is passed to cmp_map_id() as first argument.
*/
struct idmap_key {
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index a6f9bdd956c3..f10f403104e7 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -273,6 +273,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
if (ret < 0)
goto error;
+ ret = -ENOMEM;
pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
if (!pages)
goto error;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07895deca271..b8b541caed48 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -169,7 +169,9 @@ struct worker_pool {
struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for workers */
+ struct work_struct idle_cull_work; /* L: worker idle cleanup */
+
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -177,6 +179,7 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
+ struct list_head dying_workers; /* A: workers about to die */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
/* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
lockdep_assert_irqs_disabled();
- /* if draining, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & __WQ_DRAINING) &&
- WARN_ON_ONCE(!is_chained_work(wq)))
+ /*
+ * For a draining wq, only works from the same workqueue are
+ * allowed. The __WQ_DESTROYING helps to spot the issue that
+ * queues a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
return;
rcu_read_lock();
retry:
@@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
list_del(&worker->node);
worker->pool = NULL;
- if (list_empty(&pool->workers))
+ if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -1972,21 +1979,55 @@ fail:
return NULL;
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void wake_dying_workers(struct list_head *cull_list)
+{
+ struct worker *worker, *tmp;
+
+ list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+ list_del_init(&worker->entry);
+ unbind_worker(worker);
+ /*
+ * If the worker was somehow already running, then it had to be
+ * in pool->idle_list when set_worker_dying() happened or we
+ * wouldn't have gotten here.
+ *
+ * Thus, the worker must either have observed the WORKER_DIE
+ * flag, or have set its state to TASK_IDLE. Either way, the
+ * below will be observed by the worker and is safe to do
+ * outside of pool->lock.
+ */
+ wake_up_process(worker->task);
+ }
+}
+
/**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
* @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
*
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->lock);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
@@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)
pool->nr_workers--;
pool->nr_idle--;
- list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- wake_up_process(worker->task);
+
+ list_move(&worker->entry, list);
+ list_move(&worker->node, &pool->dying_workers);
}
+/**
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ * @t: The pool's idle_timer that just expired
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the too_many_workers() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ bool do_cull = false;
+
+ if (work_pending(&pool->idle_cull_work))
+ return;
raw_spin_lock_irq(&pool->lock);
- while (too_many_workers(pool)) {
+ if (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+ do_cull = !time_before(jiffies, expires);
+
+ if (!do_cull)
+ mod_timer(&pool->idle_timer, expires);
+ }
+ raw_spin_unlock_irq(&pool->lock);
+
+ if (do_cull)
+ queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/**
+ * idle_cull_fn - cull workers that have been idle for too long.
+ * @work: the pool's work for handling these idle workers
+ *
+ * This goes through a pool's idle workers and gets rid of those that have been
+ * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+ struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+ struct list_head cull_list;
+
+ INIT_LIST_HEAD(&cull_list);
+ /*
+ * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+ * cannot proceed beyong worker_detach_from_pool() in its self-destruct
+ * path. This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before wake_dying_workers() did.
+ */
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+
+ while (too_many_workers(pool)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
}
raw_spin_unlock_irq(&pool->lock);
+ wake_dying_workers(&cull_list);
+ mutex_unlock(&wq_pool_attach_mutex);
}
static void send_mayday(struct work_struct *work)
@@ -2388,12 +2489,12 @@ woke_up:
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
- WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
kfree(worker);
return 0;
}
@@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)
hash_init(pool->busy_hash);
timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+ INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
+ INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
kfree(pool);
}
-/* This returns with the lock held on success (pool manager is inactive). */
-static bool wq_manager_inactive(struct worker_pool *pool)
-{
- raw_spin_lock_irq(&pool->lock);
-
- if (pool->flags & POOL_MANAGER_ACTIVE) {
- raw_spin_unlock_irq(&pool->lock);
- return false;
- }
- return true;
-}
-
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
@@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
+ struct list_head cull_list;
struct worker *worker;
+ INIT_LIST_HEAD(&cull_list);
+
lockdep_assert_held(&wq_pool_mutex);
if (--pool->refcnt)
@@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
- * Because of how wq_manager_inactive() works, we will hold the
- * spinlock after a successful wait.
+ *
+ * Having a concurrent manager is quite unlikely to happen as we can
+ * only get here with
+ * pwq->refcnt == pool->refcnt == 0
+ * which implies no work queued to the pool, which implies no worker can
+ * become the manager. However a worker could have taken the role of
+ * manager before the refcnts dropped to 0, since maybe_create_worker()
+ * drops pool->lock
*/
- rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
- TASK_UNINTERRUPTIBLE);
- pool->flags |= POOL_MANAGER_ACTIVE;
+ while (true) {
+ rcuwait_wait_event(&manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE),
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+ if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
+ pool->flags |= POOL_MANAGER_ACTIVE;
+ break;
+ }
+ raw_spin_unlock_irq(&pool->lock);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
while ((worker = first_idle_worker(pool)))
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- mutex_lock(&wq_pool_attach_mutex);
- if (!list_empty(&pool->workers))
+ wake_dying_workers(&cull_list);
+
+ if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
+ cancel_work_sync(&pool->idle_cull_work);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
@@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+ const struct workqueue_attrs *attrs,
+ const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
/*
- * Calculate the attrs of the default pwq.
+ * Calculate the attrs of the default pwq with unbound_cpumask
+ * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
* If the user configured cpumask doesn't overlap with the
* wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
*/
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_copy(new_attrs->cpumask, unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
wq->flags &= ~__WQ_ORDERED;
}
- ctx = apply_wqattrs_prepare(wq, attrs);
+ ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
if (!ctx)
return -ENOMEM;
@@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
*/
workqueue_sysfs_unregister(wq);
+ /* mark the workqueue destruction is in progress */
+ mutex_lock(&wq->mutex);
+ wq->flags |= __WQ_DESTROYING;
+ mutex_unlock(&wq->mutex);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
@@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)
pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}
-static void pr_cont_work(bool comma, struct work_struct *work)
+struct pr_cont_work_struct {
+ bool comma;
+ work_func_t func;
+ long ctr;
+};
+
+static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
+{
+ if (!pcwsp->ctr)
+ goto out_record;
+ if (func == pcwsp->func) {
+ pcwsp->ctr++;
+ return;
+ }
+ if (pcwsp->ctr == 1)
+ pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
+ else
+ pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
+ pcwsp->ctr = 0;
+out_record:
+ if ((long)func == -1L)
+ return;
+ pcwsp->comma = comma;
+ pcwsp->func = func;
+ pcwsp->ctr = 1;
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
if (work->func == wq_barrier_func) {
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
- pr_cont("%s %ps", comma ? "," : "", work->func);
+ if (!comma)
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
+ pr_cont_work_flush(comma, work->func, pcwsp);
}
}
static void show_pwq(struct pool_workqueue *pwq)
{
+ struct pr_cont_work_struct pcws = { .ctr = 0, };
struct worker_pool *pool = pwq->pool;
struct work_struct *work;
struct worker *worker;
@@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)
worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
- pr_cont_work(false, work);
+ pr_cont_work(false, work, &pcws);
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
comma = true;
}
pr_cont("\n");
@@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)
if (get_work_pwq(work) != pwq)
continue;
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
@@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" inactive:");
list_for_each_entry(work, &pwq->inactive_works, entry) {
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
}
@@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5334,7 +5483,7 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)
if (wq->flags & __WQ_ORDERED)
continue;
- ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (!ctx) {
ret = -ENOMEM;
break;
@@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)
apply_wqattrs_cleanup(ctx);
}
+ if (!ret) {
+ mutex_lock(&wq_pool_attach_mutex);
+ cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
return ret;
}
@@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
int ret = -EINVAL;
- cpumask_var_t saved_cpumask;
/*
* Not excluding isolated cpus on purpose.
@@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
goto out_unlock;
}
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- /* save the old wq_unbound_cpumask. */
- cpumask_copy(saved_cpumask, wq_unbound_cpumask);
-
- /* update wq_unbound_cpumask at first and apply it to wqs. */
- cpumask_copy(wq_unbound_cpumask, cpumask);
- ret = workqueue_apply_unbound_cpumask();
-
- /* restore the wq_unbound_cpumask when failed. */
- if (ret < 0)
- cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ ret = workqueue_apply_unbound_cpumask(cpumask);
- free_cpumask_var(saved_cpumask);
out_unlock:
apply_wqattrs_unlock();
}