summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/btf.c 481
-rw-r--r-- kernel/bpf/core.c 232
-rw-r--r-- kernel/bpf/hashtab.c 13
-rw-r--r-- kernel/bpf/local_storage.c 6
-rw-r--r-- kernel/bpf/lpm_trie.c 59
-rw-r--r-- kernel/bpf/offload.c 76
-rw-r--r-- kernel/bpf/queue_stack_maps.c 16
-rw-r--r-- kernel/bpf/syscall.c 120
-rw-r--r-- kernel/bpf/verifier.c 456
-rw-r--r-- kernel/cpu.c 15
-rw-r--r-- kernel/debug/kdb/kdb_bt.c 4
-rw-r--r-- kernel/debug/kdb/kdb_io.c 15
-rw-r--r-- kernel/debug/kdb/kdb_keyboard.c 4
-rw-r--r-- kernel/debug/kdb/kdb_main.c 35
-rw-r--r-- kernel/debug/kdb/kdb_private.h 2
-rw-r--r-- kernel/debug/kdb/kdb_support.c 28
-rw-r--r-- kernel/dma/swiotlb.c 3
-rw-r--r-- kernel/events/uprobes.c 14
-rw-r--r-- kernel/kcov.c 4
-rw-r--r-- kernel/ptrace.c 10
-rw-r--r-- kernel/resource.c 19
-rw-r--r-- kernel/sched/core.c 24
-rw-r--r-- kernel/sched/fair.c 66
-rw-r--r-- kernel/sched/psi.c 71
-rw-r--r-- kernel/sched/sched.h 4
-rw-r--r-- kernel/sched/stats.h 8
-rw-r--r-- kernel/stackleak.c 6
-rw-r--r-- kernel/time/posix-cpu-timers.c 3
-rw-r--r-- kernel/trace/bpf_trace.c 8
-rw-r--r-- kernel/trace/ftrace.c 7
-rw-r--r-- kernel/trace/trace.h 57
-rw-r--r-- kernel/trace/trace_functions_graph.c 53
-rw-r--r-- kernel/trace/trace_irqsoff.c 2
-rw-r--r-- kernel/trace/trace_sched_wakeup.c 2
-rw-r--r-- kernel/user_namespace.c 12
35 files changed, 1570 insertions, 365 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ee4c82667d65..bf34933cc413 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5,6 +5,7 @@
#include <uapi/linux/types.h>
#include <linux/seq_file.h>
#include <linux/compiler.h>
+#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
@@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST] = "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
+ [BTF_KIND_FUNC] = "FUNC",
+ [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
};
struct btf_kind_operations {
@@ -281,6 +284,9 @@ struct btf_kind_operations {
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
static struct btf_type btf_void;
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -306,15 +312,33 @@ static bool btf_type_is_modifier(const struct btf_type *t)
static bool btf_type_is_void(const struct btf_type *t)
{
- /* void => no type and size info.
- * Hence, FWD is also treated as void.
- */
- return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+ return t == &btf_void;
+}
+
+static bool btf_type_is_fwd(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_func(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t);
}
-static bool btf_type_is_void_or_null(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return !t || btf_type_is_void(t);
+ return !t || btf_type_nosize(t);
}
/* union is only a special case of struct:
@@ -420,13 +444,37 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
return kind_ops[BTF_INFO_KIND(t->info)];
}
-static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+bool btf_name_offset_valid(const struct btf *btf, u32 offset)
{
return BTF_STR_OFFSET_VALID(offset) &&
offset < btf->hdr.str_len;
}
-static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+/* Only C-style identifiers are permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+ /* offset must be valid */
+ const char *src = &btf->strings[offset];
+ const char *src_limit;
+
+ if (!isalpha(*src) && *src != '_')
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ src++;
+ while (*src && src < src_limit) {
+ if (!isalnum(*src) && *src != '_')
+ return false;
+ src++;
+ }
+
+ return !*src;
+}
+
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
{
if (!offset)
return "(anon)";
@@ -436,7 +484,7 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
return "(invalid-name-offset)";
}
-static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
if (type_id > btf->nr_types)
return NULL;
@@ -740,11 +788,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
/* int, enum or void is a sink */
return !btf_type_needs_resolve(next_type);
case RESOLVE_PTR:
- /* int, enum, void, struct or array is a sink for ptr */
+ /* int, enum, void, struct, array, func or func_proto is a sink
+ * for ptr
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_ptr(next_type);
case RESOLVE_STRUCT_OR_ARRAY:
- /* int, enum, void or ptr is a sink for struct and array */
+ /* int, enum, void, ptr, func or func_proto is a sink
+ * for struct and array
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_array(next_type) &&
!btf_type_is_struct(next_type);
@@ -826,7 +878,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 size = 0;
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void_or_null(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
if (btf_type_has_size(size_type)) {
@@ -842,7 +894,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
}
@@ -1143,6 +1195,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* typedef type must have a valid name, and other ref types,
+ * volatile, const, restrict, should have a null name.
+ */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ } else {
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ }
+
btf_verifier_log_type(env, t, NULL);
return 0;
@@ -1163,10 +1231,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "typedef void new_void", "const void"...etc */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1177,13 +1241,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ /* "typedef void new_void", "const void"...etc */
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, next_type_size);
return 0;
@@ -1196,7 +1265,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type) {
@@ -1204,10 +1272,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "void *" */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1234,13 +1298,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
resolved_type_id);
}
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
@@ -1300,6 +1369,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* fwd type must have a valid name */
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
return 0;
@@ -1356,6 +1432,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* array type should not have a name */
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -1396,7 +1478,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_is_void_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1415,7 +1497,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_is_void_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -1532,6 +1614,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* struct type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
last_offset = 0;
@@ -1543,6 +1632,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* struct member either no name or a valid one */
+ if (member->name_off &&
+ !btf_name_valid_identifier(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member, "Invalid name");
+ return -EINVAL;
+ }
/* A member cannot be in type void */
if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
btf_verifier_log_member(env, t, member,
@@ -1568,7 +1663,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
btf_verifier_log_member(env, t, member,
- "Memmber bits_offset exceeds its struct size");
+ "Member bits_offset exceeds its struct size");
return -EINVAL;
}
@@ -1615,7 +1710,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_is_void_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
@@ -1730,6 +1825,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
btf_verifier_log_type(env, t, NULL);
for (i = 0; i < nr_enums; i++) {
@@ -1739,6 +1841,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+
btf_verifier_log(env, "\t%s val=%d\n",
btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
@@ -1780,6 +1890,232 @@ static struct btf_kind_operations enum_ops = {
.seq_show = btf_enum_seq_show,
};
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_param *args = (const struct btf_param *)(t + 1);
+ u16 nr_args = btf_type_vlen(t), i;
+
+ btf_verifier_log(env, "return=%u args=(", t->type);
+ if (!nr_args) {
+ btf_verifier_log(env, "void");
+ goto done;
+ }
+
+ if (nr_args == 1 && !args[0].type) {
+ /* Only one vararg */
+ btf_verifier_log(env, "vararg");
+ goto done;
+ }
+
+ btf_verifier_log(env, "%u %s", args[0].type,
+ btf_name_by_offset(env->btf,
+ args[0].name_off));
+ for (i = 1; i < nr_args - 1; i++)
+ btf_verifier_log(env, ", %u %s", args[i].type,
+ btf_name_by_offset(env->btf,
+ args[i].name_off));
+
+ if (nr_args > 1) {
+ const struct btf_param *last_arg = &args[nr_args - 1];
+
+ if (last_arg->type)
+ btf_verifier_log(env, ", %u %s", last_arg->type,
+ btf_name_by_offset(env->btf,
+ last_arg->name_off));
+ else
+ btf_verifier_log(env, ", vararg");
+ }
+
+done:
+ btf_verifier_log(env, ")");
+}
+
+static struct btf_kind_operations func_proto_ops = {
+ .check_meta = btf_func_proto_check_meta,
+ .resolve = btf_df_resolve,
+ /*
+ * BTF_KIND_FUNC_PROTO cannot be directly referred by
+ * a struct's member.
+ *
+ * It should be a function pointer instead.
+ * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+ *
+ * Hence, there is no btf_func_check_member().
+ */
+ .check_member = btf_df_check_member,
+ .log_details = btf_func_proto_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static struct btf_kind_operations func_ops = {
+ .check_meta = btf_func_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *ret_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+ int err;
+
+ btf = env->btf;
+ args = (const struct btf_param *)(t + 1);
+ nr_args = btf_type_vlen(t);
+
+ /* Check func return type which could be "void" (t->type == 0) */
+ if (t->type) {
+ u32 ret_type_id = t->type;
+
+ ret_type = btf_type_by_id(btf, ret_type_id);
+ if (!ret_type) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
+ if (btf_type_needs_resolve(ret_type) &&
+ !env_type_is_resolved(env, ret_type_id)) {
+ err = btf_resolve(env, ret_type, ret_type_id);
+ if (err)
+ return err;
+ }
+
+ /* Ensure the return type is a type that has a size */
+ if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+ }
+
+ if (!nr_args)
+ return 0;
+
+ /* Last func arg type_id could be 0 if it is a vararg */
+ if (!args[nr_args - 1].type) {
+ if (args[nr_args - 1].name_off) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u",
+ nr_args);
+ return -EINVAL;
+ }
+ nr_args--;
+ }
+
+ err = 0;
+ for (i = 0; i < nr_args; i++) {
+ const struct btf_type *arg_type;
+ u32 arg_type_id;
+
+ arg_type_id = args[i].type;
+ arg_type = btf_type_by_id(btf, arg_type_id);
+ if (!arg_type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (args[i].name_off &&
+ (!btf_name_offset_valid(btf, args[i].name_off) ||
+ !btf_name_valid_identifier(btf, args[i].name_off))) {
+ btf_verifier_log_type(env, t,
+ "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (btf_type_needs_resolve(arg_type) &&
+ !env_type_is_resolved(env, arg_type_id)) {
+ err = btf_resolve(env, arg_type, arg_type_id);
+ if (err)
+ break;
+ }
+
+ if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *proto_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+
+ btf = env->btf;
+ proto_type = btf_type_by_id(btf, t->type);
+
+ if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ args = (const struct btf_param *)(proto_type + 1);
+ nr_args = btf_type_vlen(proto_type);
+ for (i = 0; i < nr_args; i++) {
+ if (!args[i].name_off && args[i].type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_INT] = &int_ops,
[BTF_KIND_PTR] = &ptr_ops,
@@ -1792,6 +2128,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = &modifier_ops,
[BTF_KIND_CONST] = &modifier_ops,
[BTF_KIND_RESTRICT] = &modifier_ops,
+ [BTF_KIND_FUNC] = &func_ops,
+ [BTF_KIND_FUNC_PROTO] = &func_proto_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -1863,30 +2201,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
return 0;
}
-static int btf_resolve(struct btf_verifier_env *env,
- const struct btf_type *t, u32 type_id)
-{
- const struct resolve_vertex *v;
- int err = 0;
-
- env->resolve_mode = RESOLVE_TBD;
- env_stack_push(env, t, type_id);
- while (!err && (v = env_stack_peak(env))) {
- env->log_type_id = v->type_id;
- err = btf_type_ops(v->t)->resolve(env, v);
- }
-
- env->log_type_id = type_id;
- if (err == -E2BIG)
- btf_verifier_log_type(env, t,
- "Exceeded max resolving depth:%u",
- MAX_RESOLVE_DEPTH);
- else if (err == -EEXIST)
- btf_verifier_log_type(env, t, "Loop detected");
-
- return err;
-}
-
static bool btf_resolve_valid(struct btf_verifier_env *env,
const struct btf_type *t,
u32 type_id)
@@ -1920,6 +2234,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return false;
}
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ u32 save_log_type_id = env->log_type_id;
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG) {
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ } else if (err == -EEXIST) {
+ btf_verifier_log_type(env, t, "Loop detected");
+ }
+
+ /* Final sanity check */
+ if (!err && !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ err = -EINVAL;
+ }
+
+ env->log_type_id = save_log_type_id;
+ return err;
+}
+
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -1942,10 +2289,16 @@ static int btf_check_all_types(struct btf_verifier_env *env)
return err;
}
- if (btf_type_needs_resolve(t) &&
- !btf_resolve_valid(env, t, type_id)) {
- btf_verifier_log_type(env, t, "Invalid resolve state");
- return -EINVAL;
+ if (btf_type_is_func_proto(t)) {
+ err = btf_func_proto_check(env, t);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_is_func(t)) {
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
}
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1a796e0799ec..5cdd8da0e7f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,12 +21,14 @@
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/frame.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
@@ -103,6 +105,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+ if (!prog->aux->nr_linfo || !prog->jit_requested)
+ return 0;
+
+ prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!prog->aux->jited_linfo)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+{
+ kfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+}
+
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
+{
+ if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
+ bpf_prog_free_jited_linfo(prog);
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off. Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the last byte of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ * The first bpf insn off of the prog. The insn off
+ * here is relative to the main prog.
+ * e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ * The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ * insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+ const u32 *insn_to_jit_off)
+{
+ u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+ const struct bpf_line_info *linfo;
+ void **jited_linfo;
+
+ if (!prog->aux->jited_linfo)
+ /* Userspace did not provide linfo */
+ return;
+
+ linfo_idx = prog->aux->linfo_idx;
+ linfo = &prog->aux->linfo[linfo_idx];
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + prog->len;
+
+ jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+ jited_linfo[0] = prog->bpf_func;
+
+ nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+ for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+ /* The verifier ensures that linfo[i].insn_off is
+ * strictly increasing
+ */
+ jited_linfo[i] = prog->bpf_func +
+ insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
+void bpf_prog_free_linfo(struct bpf_prog *prog)
+{
+ bpf_prog_free_jited_linfo(prog);
+ kvfree(prog->aux->linfo);
+}
+
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -292,6 +379,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
return ret;
}
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+ struct bpf_line_info *linfo;
+ u32 i, nr_linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || !delta)
+ return;
+
+ linfo = prog->aux->linfo;
+
+ for (i = 0; i < nr_linfo; i++)
+ if (off < linfo[i].insn_off)
+ break;
+
+ /* Push all off < linfo[i].insn_off by delta */
+ for (; i < nr_linfo; i++)
+ linfo[i].insn_off += delta;
+}
+
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
@@ -347,6 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
*/
BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ bpf_adj_linfo(prog_adj, off, insn_delta);
+
return prog_adj;
}
@@ -390,6 +499,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
+ const struct btf_type *type;
+ const char *func_name;
BUILD_BUG_ON(sizeof("bpf_prog_") +
sizeof(prog->tag) * 2 +
@@ -404,6 +515,16 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+ /* prog->aux->name will be ignored if full btf name is available */
+ if (prog->aux->func_info_cnt) {
+ type = btf_type_by_id(prog->aux->btf,
+ prog->aux->func_info[prog->aux->func_idx].type_id);
+ func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+ snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+ return;
+ }
+
if (prog->aux->name[0])
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
else
@@ -609,6 +730,16 @@ static void bpf_jit_uncharge_modmem(u32 pages)
atomic_long_sub(pages, &bpf_jit_current);
}
+void *__weak bpf_jit_alloc_exec(unsigned long size)
+{
+ return module_alloc(size);
+}
+
+void __weak bpf_jit_free_exec(void *addr)
+{
+ module_memfree(addr);
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -626,7 +757,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
if (bpf_jit_charge_modmem(pages))
return NULL;
- hdr = module_alloc(size);
+ hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
bpf_jit_uncharge_modmem(pages);
return NULL;
@@ -650,7 +781,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
u32 pages = hdr->pages;
- module_memfree(hdr);
+ bpf_jit_free_exec(hdr);
bpf_jit_uncharge_modmem(pages);
}
@@ -672,6 +803,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
bpf_prog_unlock_free(fp);
}
+int bpf_jit_get_func_addr(const struct bpf_prog *prog,
+ const struct bpf_insn *insn, bool extra_pass,
+ u64 *func_addr, bool *func_addr_fixed)
+{
+ s16 off = insn->off;
+ s32 imm = insn->imm;
+ u8 *addr;
+
+ *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
+ if (!*func_addr_fixed) {
+ /* Place-holder address till the last pass has collected
+ * all addresses for JITed subprograms in which case we
+ * can pick them up from prog->aux.
+ */
+ if (!extra_pass)
+ addr = NULL;
+ else if (prog->aux->func &&
+ off >= 0 && off < prog->aux->func_cnt)
+ addr = (u8 *)prog->aux->func[off]->bpf_func;
+ else
+ return -EINVAL;
+ } else {
+ /* Address of a BPF helper call. Since part of the core
+ * kernel, it's always at a fixed location. __bpf_call_base
+ * and the helper with imm relative to it are both in core
+ * kernel.
+ */
+ addr = (u8 *)__bpf_call_base + imm;
+ }
+
+ *func_addr = (unsigned long)addr;
+ return 0;
+}
+
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
struct bpf_insn *to_buff)
@@ -875,32 +1040,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
#define BPF_INSN_MAP(INSN_2, INSN_3) \
/* 32 bit ALU operations. */ \
/* Register based. */ \
- INSN_3(ALU, ADD, X), \
- INSN_3(ALU, SUB, X), \
- INSN_3(ALU, AND, X), \
- INSN_3(ALU, OR, X), \
- INSN_3(ALU, LSH, X), \
- INSN_3(ALU, RSH, X), \
- INSN_3(ALU, XOR, X), \
- INSN_3(ALU, MUL, X), \
- INSN_3(ALU, MOV, X), \
- INSN_3(ALU, DIV, X), \
- INSN_3(ALU, MOD, X), \
+ INSN_3(ALU, ADD, X), \
+ INSN_3(ALU, SUB, X), \
+ INSN_3(ALU, AND, X), \
+ INSN_3(ALU, OR, X), \
+ INSN_3(ALU, LSH, X), \
+ INSN_3(ALU, RSH, X), \
+ INSN_3(ALU, XOR, X), \
+ INSN_3(ALU, MUL, X), \
+ INSN_3(ALU, MOV, X), \
+ INSN_3(ALU, ARSH, X), \
+ INSN_3(ALU, DIV, X), \
+ INSN_3(ALU, MOD, X), \
INSN_2(ALU, NEG), \
INSN_3(ALU, END, TO_BE), \
INSN_3(ALU, END, TO_LE), \
/* Immediate based. */ \
- INSN_3(ALU, ADD, K), \
- INSN_3(ALU, SUB, K), \
- INSN_3(ALU, AND, K), \
- INSN_3(ALU, OR, K), \
- INSN_3(ALU, LSH, K), \
- INSN_3(ALU, RSH, K), \
- INSN_3(ALU, XOR, K), \
- INSN_3(ALU, MUL, K), \
- INSN_3(ALU, MOV, K), \
- INSN_3(ALU, DIV, K), \
- INSN_3(ALU, MOD, K), \
+ INSN_3(ALU, ADD, K), \
+ INSN_3(ALU, SUB, K), \
+ INSN_3(ALU, AND, K), \
+ INSN_3(ALU, OR, K), \
+ INSN_3(ALU, LSH, K), \
+ INSN_3(ALU, RSH, K), \
+ INSN_3(ALU, XOR, K), \
+ INSN_3(ALU, MUL, K), \
+ INSN_3(ALU, MOV, K), \
+ INSN_3(ALU, ARSH, K), \
+ INSN_3(ALU, DIV, K), \
+ INSN_3(ALU, MOD, K), \
/* 64 bit ALU operations. */ \
/* Register based. */ \
INSN_3(ALU64, ADD, X), \
@@ -1079,6 +1246,12 @@ select_insn:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+ ALU_ARSH_X:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ CONT;
+ ALU_ARSH_K:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
@@ -1525,13 +1698,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* be JITed, but falls back to the interpreter.
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
+ *err = bpf_prog_alloc_jited_linfo(fp);
+ if (*err)
+ return fp;
+
fp = bpf_int_jit_compile(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
if (!fp->jited) {
+ bpf_prog_free_jited_linfo(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
- }
#endif
+ } else {
+ bpf_prog_free_unused_jited_linfo(fp);
+ }
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c1790288138..4b7c76765d9d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,7 +23,7 @@
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
struct bucket {
struct hlist_nulls_head head;
@@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
return -EPERM;
+ if (zero_seed && !capable(CAP_SYS_ADMIN))
+ /* Guard against local DoS, and discourage production use. */
+ return -EPERM;
+
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return -EINVAL;
@@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_htab;
- htab->hashrnd = get_random_int();
+ if (htab->map.map_flags & BPF_F_ZERO_SEED)
+ htab->hashrnd = 0;
+ else
+ htab->hashrnd = get_random_int();
+
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c97a8f968638..b65017dead44 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,8 +7,7 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
#ifdef CONFIG_CGROUP_BPF
@@ -139,7 +138,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return -ENOENT;
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
- map->value_size, __GFP_ZERO | GFP_USER,
+ map->value_size,
+ __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)
return -ENOMEM;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9058317ba9de..bfd4882e1106 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
const struct lpm_trie_node *node,
const struct bpf_lpm_trie_key *key)
{
- size_t prefixlen = 0;
- size_t i;
+ u32 limit = min(node->prefixlen, key->prefixlen);
+ u32 prefixlen = 0, i = 0;
- for (i = 0; i < trie->data_size; i++) {
- size_t b;
+ BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
- b = 8 - fls(node->data[i] ^ key->data[i]);
- prefixlen += b;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
- if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
- return min(node->prefixlen, key->prefixlen);
+ /* data_size >= 16 has very small probability.
+ * We do not use a loop for optimal code generation.
+ */
+ if (trie->data_size >= 8) {
+ u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+ *(__be64 *)key->data);
+
+ prefixlen = 64 - fls64(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i = 8;
+ }
+#endif
+
+ while (trie->data_size >= i + 4) {
+ u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+ *(__be32 *)&key->data[i]);
+
+ prefixlen += 32 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 4;
+ }
- if (b < 8)
- break;
+ if (trie->data_size >= i + 2) {
+ u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+ *(__be16 *)&key->data[i]);
+
+ prefixlen += 16 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 2;
+ }
+
+ if (trie->data_size >= i + 1) {
+ prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+ if (prefixlen >= limit)
+ return limit;
}
return prefixlen;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8e93c47f0779..54cf2b9c44a4 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -33,6 +33,7 @@
static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
+ const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
};
@@ -106,6 +107,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
err = -EINVAL;
goto err_unlock;
}
+ offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
dev_put(offload->netdev);
@@ -121,40 +123,20 @@ err_maybe_put:
return err;
}
-static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
- struct netdev_bpf *data)
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- if (!offload)
- return -ENODEV;
- netdev = offload->netdev;
-
- data->command = cmd;
-
- return netdev->netdev_ops->ndo_bpf(netdev, data);
-}
-
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
-{
- struct netdev_bpf data = {};
- int err;
-
- data.verifier.prog = env->prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
- if (err)
- goto exit_unlock;
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload) {
+ ret = offload->offdev->ops->prepare(prog);
+ offload->dev_state = !ret;
+ }
+ up_read(&bpf_devs_lock);
- env->prog->aux->offload->dev_ops = data.verifier.ops;
- env->prog->aux->offload->dev_state = true;
-exit_unlock:
- rtnl_unlock();
- return err;
+ return ret;
}
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
@@ -166,7 +148,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload)
- ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ ret = offload->offdev->ops->insn_hook(env, insn_idx,
+ prev_insn_idx);
up_read(&bpf_devs_lock);
return ret;
@@ -180,8 +163,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload) {
- if (offload->dev_ops->finalize)
- ret = offload->dev_ops->finalize(env);
+ if (offload->offdev->ops->finalize)
+ ret = offload->offdev->ops->finalize(env);
else
ret = 0;
}
@@ -193,12 +176,9 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
- struct netdev_bpf data = {};
-
- data.offload.prog = prog;
if (offload->dev_state)
- WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+ offload->offdev->ops->destroy(prog);
/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
bpf_prog_free_id(prog, true);
@@ -210,24 +190,22 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
void bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- rtnl_lock();
down_write(&bpf_devs_lock);
if (prog->aux->offload)
__bpf_prog_offload_destroy(prog);
up_write(&bpf_devs_lock);
- rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
{
- struct netdev_bpf data = {};
- int ret;
-
- data.offload.prog = prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
- rtnl_unlock();
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload)
+ ret = offload->offdev->ops->translate(prog);
+ up_read(&bpf_devs_lock);
return ret;
}
@@ -655,7 +633,8 @@ unlock:
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
-struct bpf_offload_dev *bpf_offload_dev_create(void)
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
{
struct bpf_offload_dev *offdev;
int err;
@@ -673,6 +652,7 @@ struct bpf_offload_dev *bpf_offload_dev_create(void)
if (!offdev)
return ERR_PTR(-ENOMEM);
+ offdev->ops = ops;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8bbd72d3a121..b384ea9f3254 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,6 +7,7 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/capability.h>
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 ||
attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
return -EINVAL;
@@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
int ret, numa_node = bpf_map_attr_numa_node(attr);
struct bpf_queue_stack *qs;
- u32 size, value_size;
- u64 queue_size, cost;
-
- size = attr->max_entries + 1;
- value_size = attr->value_size;
-
- queue_size = sizeof(*qs) + (u64) value_size * size;
+ u64 size, queue_size, cost;
- cost = queue_size;
+ size = (u64) attr->max_entries + 1;
+ cost = queue_size = sizeof(*qs) + size * attr->value_size;
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5040fd5434..5745c7837621 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1213,6 +1213,9 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ kvfree(prog->aux->func_info);
+ bpf_prog_free_linfo(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
@@ -1437,9 +1440,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
+#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
@@ -1450,9 +1453,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
@@ -1525,7 +1533,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
/* run eBPF verifier */
- err = bpf_check(&prog, attr);
+ err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
@@ -1553,6 +1561,9 @@ static int bpf_prog_load(union bpf_attr *attr)
return err;
free_used_maps:
+ bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->func_info);
+ btf_put(prog->aux->btf);
bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
free_prog:
@@ -2032,6 +2043,37 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
return insns;
}
+/*
+ * Validate and normalize the three *_rec_size fields of a bpf_prog_info
+ * that was filled in by user space.
+ *
+ * Returns 0 after overwriting func_info_rec_size, line_info_rec_size and
+ * jited_line_info_rec_size with the sizes the kernel uses, or -EINVAL when
+ * a non-zero count or a non-zero rec_size is paired with a rec_size that
+ * differs from the kernel's.
+ */
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+ /*
+ * Ensure info.*_rec_size is the same as kernel expected size
+ *
+ * or
+ *
+ * Only allow zero *_rec_size if both _rec_size and _cnt are
+ * zero. In this case, the kernel will set the expected
+ * _rec_size back to the info.
+ */
+
+ if ((info->nr_func_info || info->func_info_rec_size) &&
+ info->func_info_rec_size != sizeof(struct bpf_func_info))
+ return -EINVAL;
+
+ if ((info->nr_line_info || info->line_info_rec_size) &&
+ info->line_info_rec_size != sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ /* jited line info records are plain __u64 values */
+ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
+ info->jited_line_info_rec_size != sizeof(__u64))
+ return -EINVAL;
+
+ /* all checks passed: report the kernel's record sizes back in info */
+ info->func_info_rec_size = sizeof(struct bpf_func_info);
+ info->line_info_rec_size = sizeof(struct bpf_line_info);
+ info->jited_line_info_rec_size = sizeof(__u64);
+
+ return 0;
+}
+
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -2074,11 +2116,18 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
+ err = set_info_rec_size(&info);
+ if (err)
+ return err;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
info.nr_jited_func_lens = 0;
+ info.nr_func_info = 0;
+ info.nr_line_info = 0;
+ info.nr_jited_line_info = 0;
goto done;
}
@@ -2160,7 +2209,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
ulen = info.nr_jited_ksyms;
info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
- if (info.nr_jited_ksyms && ulen) {
+ if (ulen) {
if (bpf_dump_raw_ok()) {
unsigned long ksym_addr;
u64 __user *user_ksyms;
@@ -2191,7 +2240,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
ulen = info.nr_jited_func_lens;
info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
- if (info.nr_jited_func_lens && ulen) {
+ if (ulen) {
if (bpf_dump_raw_ok()) {
u32 __user *user_lens;
u32 func_len, i;
@@ -2216,6 +2265,63 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
}
+ if (prog->aux->btf)
+ info.btf_id = btf_id(prog->aux->btf);
+
+ ulen = info.nr_func_info;
+ info.nr_func_info = prog->aux->func_info_cnt;
+ if (info.nr_func_info && ulen) {
+ if (bpf_dump_raw_ok()) {
+ char __user *user_finfo;
+
+ user_finfo = u64_to_user_ptr(info.func_info);
+ ulen = min_t(u32, info.nr_func_info, ulen);
+ if (copy_to_user(user_finfo, prog->aux->func_info,
+ info.func_info_rec_size * ulen))
+ return -EFAULT;
+ } else {
+ info.func_info = 0;
+ }
+ }
+
+ ulen = info.nr_line_info;
+ info.nr_line_info = prog->aux->nr_linfo;
+ if (info.nr_line_info && ulen) {
+ if (bpf_dump_raw_ok()) {
+ __u8 __user *user_linfo;
+
+ user_linfo = u64_to_user_ptr(info.line_info);
+ ulen = min_t(u32, info.nr_line_info, ulen);
+ if (copy_to_user(user_linfo, prog->aux->linfo,
+ info.line_info_rec_size * ulen))
+ return -EFAULT;
+ } else {
+ info.line_info = 0;
+ }
+ }
+
+ ulen = info.nr_jited_line_info;
+ if (prog->aux->jited_linfo)
+ info.nr_jited_line_info = prog->aux->nr_linfo;
+ else
+ info.nr_jited_line_info = 0;
+ if (info.nr_jited_line_info && ulen) {
+ if (bpf_dump_raw_ok()) {
+ __u64 __user *user_linfo;
+ u32 i;
+
+ user_linfo = u64_to_user_ptr(info.jited_line_info);
+ ulen = min_t(u32, info.nr_jited_line_info, ulen);
+ for (i = 0; i < ulen; i++) {
+ if (put_user((__u64)(long)prog->aux->jited_linfo[i],
+ &user_linfo[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_line_info = 0;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -2501,7 +2607,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_get_next_key(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr);
+ err = bpf_prog_load(&attr, uattr);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1971ca325fb4..8b511a4fe84a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11,10 +11,12 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <uapi/linux/btf.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
@@ -175,6 +177,7 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_PTR_UNPRIV 1UL
#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
@@ -1455,6 +1458,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
+
+ /* __check_packet_access has made sure "off + size - 1" is within u16.
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+ * otherwise find_good_pkt_pointers would have refused to set range info
+ * and __check_packet_access would have rejected this pkt access.
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+ */
+ env->prog->aux->max_pkt_offset =
+ max_t(u32, env->prog->aux->max_pkt_offset,
+ off + reg->umax_value + size - 1);
+
return err;
}
@@ -3570,12 +3584,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = regs + insn->src_reg;
+ struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
* copy register state to dest reg
*/
- regs[insn->dst_reg] = regs[insn->src_reg];
- regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -3583,9 +3600,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
"R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ } else {
+ mark_reg_unknown(env, regs,
+ insn->dst_reg);
}
- mark_reg_unknown(env, regs, insn->dst_reg);
- coerce_reg_to_size(&regs[insn->dst_reg], 4);
+ coerce_reg_to_size(dst_reg, 4);
}
} else {
/* case: R = imm
@@ -3636,11 +3658,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
- verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
- return -EINVAL;
- }
-
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3751,6 +3768,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
}
}
+/* compute branch direction of the expression "if (reg opcode val) goto target;"
+ * and return:
+ * 1 - branch will be taken and "goto target" will be executed
+ * 0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
+ *
+ * A register for which __is_pointer_value(false, reg) is true always yields
+ * -1. BPF_JEQ/BPF_JNE are decided only when var_off is a known constant. The
+ * ordered comparisons are decided from the register's bounds: umin/umax for
+ * the unsigned opcodes, smin/smax (with val viewed as s64) for the signed
+ * ones. Any opcode not listed below yields -1.
+ */
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+{
+	if (__is_pointer_value(false, reg))
+		return -1;
+
+	switch (opcode) {
+	case BPF_JEQ:
+		if (tnum_is_const(reg->var_off))
+			return !!tnum_equals_const(reg->var_off, val);
+		break;
+	case BPF_JNE:
+		if (tnum_is_const(reg->var_off))
+			return !tnum_equals_const(reg->var_off, val);
+		break;
+	case BPF_JGT:
+		if (reg->umin_value > val)
+			return 1;
+		else if (reg->umax_value <= val)
+			return 0;
+		break;
+	case BPF_JSGT:
+		if (reg->smin_value > (s64)val)
+			return 1;
+		/* "reg > val" is impossible once smax <= val, equality
+		 * included, exactly as in the unsigned BPF_JGT case above.
+		 */
+		else if (reg->smax_value <= (s64)val)
+			return 0;
+		break;
+	case BPF_JLT:
+		if (reg->umax_value < val)
+			return 1;
+		else if (reg->umin_value >= val)
+			return 0;
+		break;
+	case BPF_JSLT:
+		if (reg->smax_value < (s64)val)
+			return 1;
+		else if (reg->smin_value >= (s64)val)
+			return 0;
+		break;
+	case BPF_JGE:
+		if (reg->umin_value >= val)
+			return 1;
+		else if (reg->umax_value < val)
+			return 0;
+		break;
+	case BPF_JSGE:
+		if (reg->smin_value >= (s64)val)
+			return 1;
+		else if (reg->smax_value < (s64)val)
+			return 0;
+		break;
+	case BPF_JLE:
+		if (reg->umax_value <= val)
+			return 1;
+		else if (reg->umin_value > val)
+			return 0;
+		break;
+	case BPF_JSLE:
+		if (reg->smax_value <= (s64)val)
+			return 1;
+		else if (reg->smin_value > (s64)val)
+			return 0;
+		break;
+	}
+
+	/* bounds straddle val (or opcode not handled): direction unknown */
+	return -1;
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4152,21 +4242,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
- /* detect if R == 0 where R was initialized to zero earlier */
- if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == SCALAR_VALUE &&
- tnum_is_const(dst_reg->var_off)) {
- if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
- (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
- /* if (imm == imm) goto pc+off;
- * only follow the goto, ignore fall-through
- */
+ if (BPF_SRC(insn->code) == BPF_K) {
+ int pred = is_branch_taken(dst_reg, insn->imm, opcode);
+
+ if (pred == 1) {
+ /* only follow the goto, ignore fall-through */
*insn_idx += insn->off;
return 0;
- } else {
- /* if (imm != imm) goto pc+off;
- * only follow fall-through branch, since
+ } else if (pred == 0) {
+ /* only follow fall-through branch, since
* that's where the program will go
*/
return 0;
@@ -4628,6 +4712,269 @@ err_free:
return ret;
}
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE 8
+#define MAX_FUNCINFO_REC_SIZE 252
+
+/* Copy the user-supplied func_info array described by attr into a kernel
+ * array and validate it against the program's subprog layout.
+ *
+ * attr->func_info_cnt must equal env->subprog_cnt, and
+ * attr->func_info_rec_size must be a multiple of sizeof(u32) within
+ * [MIN_BPF_FUNCINFO_SIZE, MAX_FUNCINFO_REC_SIZE]. Record i must have an
+ * insn_off equal to env->subprog_info[i].start (0 for the first record,
+ * strictly increasing afterwards) and a type_id that btf_type_by_id()
+ * resolves to a BTF_KIND_FUNC type in prog->aux->btf.
+ *
+ * Returns 0 on success, with the array stored in prog->aux->func_info and
+ * its length in prog->aux->func_info_cnt; also returns 0 without allocating
+ * anything when attr->func_info_cnt is 0. On failure the array is freed and
+ * a negative errno is returned. When bpf_check_uarg_tail_zero() reports
+ * -E2BIG, the record size the kernel copies is written back to
+ * uattr->func_info_rec_size.
+ */
+static int check_btf_func(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	u32 i, nfuncs, urec_size, min_size;
+	u32 krec_size = sizeof(struct bpf_func_info);
+	struct bpf_func_info *krecord;
+	const struct btf_type *type;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	void __user *urecord;
+	/* Only read for i > 0; initialized like in check_btf_line() so the
+	 * compiler cannot see a maybe-uninitialized use.
+	 */
+	u32 prev_offset = 0;
+	int ret = 0;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs)
+		return 0;
+
+	if (nfuncs != env->subprog_cnt) {
+		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+		return -EINVAL;
+	}
+
+	urec_size = attr->func_info_rec_size;
+	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+	    urec_size > MAX_FUNCINFO_REC_SIZE ||
+	    urec_size % sizeof(u32)) {
+		verbose(env, "invalid func info rec size %u\n", urec_size);
+		return -EINVAL;
+	}
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	urecord = u64_to_user_ptr(attr->func_info);
+	/* copy only the part of each record both sides know about */
+	min_size = min_t(u32, krec_size, urec_size);
+
+	/* zeroed so fields missing from a shorter user record read as 0 */
+	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!krecord)
+		return -ENOMEM;
+
+	for (i = 0; i < nfuncs; i++) {
+		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+		if (ret) {
+			if (ret == -E2BIG) {
+				verbose(env, "nonzero tailing record in func info");
+				/* set the size kernel expects so loader can zero
+				 * out the rest of the record.
+				 */
+				if (put_user(min_size, &uattr->func_info_rec_size))
+					ret = -EFAULT;
+			}
+			goto err_free;
+		}
+
+		if (copy_from_user(&krecord[i], urecord, min_size)) {
+			ret = -EFAULT;
+			goto err_free;
+		}
+
+		/* check insn_off */
+		if (i == 0) {
+			if (krecord[i].insn_off) {
+				verbose(env,
+					"nonzero insn_off %u for the first func info record",
+					krecord[i].insn_off);
+				ret = -EINVAL;
+				goto err_free;
+			}
+		} else if (krecord[i].insn_off <= prev_offset) {
+			verbose(env,
+				"same or smaller insn offset (%u) than previous func info record (%u)",
+				krecord[i].insn_off, prev_offset);
+			ret = -EINVAL;
+			goto err_free;
+		}
+
+		if (env->subprog_info[i].start != krecord[i].insn_off) {
+			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+			ret = -EINVAL;
+			goto err_free;
+		}
+
+		/* check type_id */
+		type = btf_type_by_id(btf, krecord[i].type_id);
+		if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+			/* type_id is unsigned: print it with %u so a large
+			 * invalid id is not shown as a negative number.
+			 */
+			verbose(env, "invalid type id %u in func info",
+				krecord[i].type_id);
+			ret = -EINVAL;
+			goto err_free;
+		}
+
+		prev_offset = krecord[i].insn_off;
+		urecord += urec_size;
+	}
+
+	/* success: the array now belongs to prog->aux */
+	prog->aux->func_info = krecord;
+	prog->aux->func_info_cnt = nfuncs;
+	return 0;
+
+err_free:
+	kvfree(krecord);
+	return ret;
+}
+
+/* Rewrite the insn_off of every func_info record with the current start of
+ * the matching subprog. Does nothing when the program has no func_info.
+ * NOTE(review): presumably needed because verifier instruction patching may
+ * have moved subprog starts since check_btf_func() ran - confirm.
+ */
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (!env->prog->aux->func_info)
+ return;
+
+ /* record i describes subprog i */
+ for (i = 0; i < env->subprog_cnt; i++)
+ env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
+}
+
+#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
+ sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
+
+/* Copy the user-supplied line_info array described by attr into a kernel
+ * array and validate it.
+ *
+ * Each record must have an insn_off that is strictly greater than the
+ * previous record's and smaller than prog->len, and line_off/file_name_off
+ * values accepted by btf_name_offset_valid(). The start of every subprog
+ * must coincide with the insn_off of some record; the index of that record
+ * is stored in the subprog's linfo_idx.
+ *
+ * Returns 0 on success, with the array stored in prog->aux->linfo and its
+ * length in prog->aux->nr_linfo; also returns 0 without allocating anything
+ * when attr->line_info_cnt is 0. On failure the array is freed and a
+ * negative errno is returned.
+ */
+static int check_btf_line(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+ struct bpf_subprog_info *sub;
+ struct bpf_line_info *linfo;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ void __user *ulinfo;
+ int err;
+
+ nr_linfo = attr->line_info_cnt;
+ if (!nr_linfo)
+ return 0;
+
+ /* rec_size must be a multiple of sizeof(u32) and lie within
+ * [MIN_BPF_LINEINFO_SIZE, MAX_LINEINFO_REC_SIZE]
+ */
+ rec_size = attr->line_info_rec_size;
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+ rec_size > MAX_LINEINFO_REC_SIZE ||
+ rec_size & (sizeof(u32) - 1))
+ return -EINVAL;
+
+ /* Need to zero it in case the userspace may
+ * pass in a smaller bpf_line_info object.
+ */
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!linfo)
+ return -ENOMEM;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ /* s is the index of the next subprog still waiting for a record */
+ s = 0;
+ sub = env->subprog_info;
+ ulinfo = u64_to_user_ptr(attr->line_info);
+ expected_size = sizeof(struct bpf_line_info);
+ /* copy only the part of each record both sides know about */
+ ncopy = min_t(u32, expected_size, rec_size);
+ for (i = 0; i < nr_linfo; i++) {
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in line_info");
+ /* report the record size the kernel uses back to user space */
+ if (put_user(expected_size,
+ &uattr->line_info_rec_size))
+ err = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ /*
+ * Check insn_off to ensure
+ * 1) strictly increasing AND
+ * 2) bounded by prog->len
+ *
+ * The linfo[0].insn_off == 0 check logically falls into
+ * the later "missing bpf_line_info for func..." case
+ * because the first linfo[0].insn_off must be the
+ * first sub also and the first sub must have
+ * subprog_info[0].start == 0.
+ */
+ if ((i && linfo[i].insn_off <= prev_offset) ||
+ linfo[i].insn_off >= prog->len) {
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+ i, linfo[i].insn_off, prev_offset,
+ prog->len);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!btf_name_offset_valid(btf, linfo[i].line_off) ||
+ !btf_name_offset_valid(btf, linfo[i].file_name_off)) {
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* Records arrive in increasing insn_off order, so a record that
+ * lands past sub[s].start without having hit it means func#s has
+ * no line info for its first instruction.
+ */
+ if (s != env->subprog_cnt) {
+ if (linfo[i].insn_off == sub[s].start) {
+ sub[s].linfo_idx = i;
+ s++;
+ } else if (sub[s].start < linfo[i].insn_off) {
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ prev_offset = linfo[i].insn_off;
+ ulinfo += rec_size;
+ }
+
+ /* records ran out before every subprog got one */
+ if (s != env->subprog_cnt) {
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+ env->subprog_cnt - s, s);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ prog->aux->linfo = linfo;
+ prog->aux->nr_linfo = nr_linfo;
+
+ return 0;
+
+err_free:
+ kvfree(linfo);
+ return err;
+}
+
+/* Entry point for validating the BTF func_info and line_info passed at
+ * program load.
+ *
+ * Returns 0 immediately when attr carries neither func_info nor line_info.
+ * Otherwise looks up the BTF object from attr->prog_btf_fd, stores it in
+ * env->prog->aux->btf and runs check_btf_func() followed by
+ * check_btf_line(), returning the first error.
+ *
+ * The BTF reference is not dropped here when a check fails; it stays in
+ * prog->aux->btf for the caller's cleanup.
+ */
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct btf *btf;
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt)
+ return 0;
+
+ btf = btf_get_by_fd(attr->prog_btf_fd);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ /* stored before the checks: both of them read prog->aux->btf */
+ env->prog->aux->btf = btf;
+
+ err = check_btf_func(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_btf_line(env, attr, uattr);
+ if (err)
+ return err;
+
+ return 0;
+}
+
/* check %cur's range satisfies %old's */
static bool range_within(struct bpf_reg_state *old,
struct bpf_reg_state *cur)
@@ -4980,7 +5327,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err;
+ int i, j, err, states_cnt = 0;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -5007,8 +5354,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 1;
}
sl = sl->next;
+ states_cnt++;
}
+ if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ return 0;
+
/* there were no equivalent states, remember current one.
* technically the current state is not proven to be safe yet,
* but it will either reach outer most bpf_exit (which means it's safe)
@@ -5148,6 +5499,9 @@ static int do_check(struct bpf_verifier_env *env)
goto process_bpf_exit;
}
+ if (signal_pending(current))
+ return -EAGAIN;
+
if (need_resched())
cond_resched();
@@ -5650,7 +6004,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
return;
/* NOTE: fake 'exit' subprog should be updated as well. */
for (i = 0; i <= env->subprog_cnt; i++) {
- if (env->subprog_info[i].start < off)
+ if (env->subprog_info[i].start <= off)
continue;
env->subprog_info[i].start += len - 1;
}
@@ -5707,10 +6061,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
+ u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
- u32 target_size;
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
@@ -5803,9 +6157,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
* we will apply proper mask to the result.
*/
is_narrower_load = size < ctx_field_size;
+ size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+ off = insn->off;
if (is_narrower_load) {
- u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
- u32 off = insn->off;
u8 size_code;
if (type == BPF_WRITE) {
@@ -5833,12 +6187,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- if (ctx_field_size <= 4)
+ u8 shift = (off & (size_default - 1)) * 8;
+
+ if (ctx_field_size <= 4) {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
- else
+ } else {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
+ }
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
@@ -5861,7 +6226,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
int i, j, subprog_start, subprog_end = 0, len, subprog;
struct bpf_insn *insn;
void *old_bpf_func;
- int err = -ENOMEM;
+ int err;
if (env->subprog_cnt <= 1)
return 0;
@@ -5892,6 +6257,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->imm = 1;
}
+ err = bpf_prog_alloc_jited_linfo(prog);
+ if (err)
+ goto out_undo_insn;
+
+ err = -ENOMEM;
func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
goto out_undo_insn;
@@ -5911,12 +6281,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->aux->func_idx = i;
+ /* the btf and func_info will be freed only at prog->aux */
+ func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->func_info = prog->aux->func_info;
+
/* Use bpf_prog_F_tag to indicate functions in stack traces.
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->linfo = prog->aux->linfo;
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -5990,6 +6369,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
+ bpf_prog_free_unused_jited_linfo(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++)
@@ -6006,6 +6386,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
+ bpf_prog_free_jited_linfo(prog);
return err;
}
@@ -6138,6 +6519,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
prog->cb_access = 1;
env->prog->aux->stack_depth = MAX_BPF_STACK;
+ env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpreter for every normal
@@ -6302,7 +6684,8 @@ static void free_states(struct bpf_verifier_env *env)
kfree(env->explored_states);
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
@@ -6350,13 +6733,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+ env->strict_alignment = false;
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
if (bpf_prog_is_dev_bound(env->prog->aux)) {
- ret = bpf_prog_offload_verifier_prep(env);
+ ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
}
@@ -6374,6 +6759,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;
+ ret = check_btf_info(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = do_check(env);
if (env->cur_state) {
free_verifier_state(env->cur_state, true);
@@ -6431,6 +6820,9 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
+ if (ret == 0)
+ adjust_btf_func(env);
+
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3c7f3b4c453c..91d5c38eb7e5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,6 +10,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
@@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void)
#endif /* CONFIG_HOTPLUG_CPU */
+/*
+ * Architectures that need SMT-specific errata handling during SMT hotplug
+ * should override this.
+ */
+void __weak arch_smt_update(void) { }
+
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
EXPORT_SYMBOL_GPL(cpu_smt_control);
@@ -1011,6 +1018,7 @@ out:
* concurrent CPU hotplug via cpu_add_remove_lock.
*/
lockup_detector_cleanup();
+ arch_smt_update();
return ret;
}
@@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpus_write_unlock();
+ arch_smt_update();
return ret;
}
@@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
-/*
- * Architectures that need SMT-specific errata handling during SMT hotplug
- * should override this.
- */
-void __weak arch_smt_update(void) { };
-
static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 6ad4a9fcbd6f..7921ae4fca8d 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv)
kdb_printf("no process for cpu %ld\n", cpu);
return 0;
}
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
return 0;
}
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
touch_nmi_watchdog();
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index ed5d34925ad0..6a4b41484afe 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int count;
int i;
int diag, dtab_count;
- int key;
+ int key, buf_size, ret;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
@@ -336,9 +336,8 @@ poll_again:
else
p_tmp = tmpbuffer;
len = strlen(p_tmp);
- count = kallsyms_symbol_complete(p_tmp,
- sizeof(tmpbuffer) -
- (p_tmp - tmpbuffer));
+ buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer);
+ count = kallsyms_symbol_complete(p_tmp, buf_size);
if (tab == 2 && count > 0) {
kdb_printf("\n%d symbols are found.", count);
if (count > dtab_count) {
@@ -350,9 +349,13 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
+ ret = kallsyms_symbol_next(p_tmp, i, buf_size);
+ if (WARN_ON(!ret))
break;
- kdb_printf("%s ", p_tmp);
+ if (ret != -E2BIG)
+ kdb_printf("%s ", p_tmp);
+ else
+ kdb_printf("%s... ", p_tmp);
*(p_tmp + len) = '\0';
}
if (i >= dtab_count)
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 118527aa60ea..750497b0003a 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -173,11 +173,11 @@ int kdb_get_kbd_char(void)
case KT_LATIN:
if (isprint(keychar))
break; /* printable characters */
- /* drop through */
+ /* fall through */
case KT_SPEC:
if (keychar == K_ENTER)
break;
- /* drop through */
+ /* fall through */
default:
return -1; /* ignore unprintables */
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bb4fe4e1a601..d72b32c66f7d 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
if (reason == KDB_REASON_DEBUG) {
/* special case below */
} else {
- kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (current=0x%px, pid %d) ",
kdb_current, kdb_current ? kdb_current->pid : 0);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
switch (db_result) {
case KDB_DB_BPT:
- kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (0x%px, pid %d) ",
kdb_current, kdb_current->pid);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1493,6 +1493,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
char cbuf[32];
char *c = cbuf;
int i;
+ int j;
unsigned long word;
memset(cbuf, '\0', sizeof(cbuf));
@@ -1538,25 +1539,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
wc.word = word;
#define printable_char(c) \
({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
- switch (bytesperword) {
- case 8:
+ for (j = 0; j < bytesperword; j++)
*c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 4;
- case 4:
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 2;
- case 2:
- *c++ = printable_char(*cp++);
- addr++;
- case 1:
- *c++ = printable_char(*cp++);
- addr++;
- break;
- }
+ addr += bytesperword;
#undef printable_char
}
}
@@ -2048,7 +2033,7 @@ static int kdb_lsmod(int argc, const char **argv)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- kdb_printf("%-20s%8u 0x%p ", mod->name,
+ kdb_printf("%-20s%8u 0x%px ", mod->name,
mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
@@ -2059,7 +2044,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->core_layout.base);
+ kdb_printf(" 0x%px", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
@@ -2341,7 +2326,7 @@ void kdb_ps1(const struct task_struct *p)
return;
cpu = kdb_process_cpu(p);
- kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n",
(void *)p, p->pid, p->parent->pid,
kdb_task_has_cpu(p), kdb_process_cpu(p),
kdb_task_state_char(p),
@@ -2354,7 +2339,7 @@ void kdb_ps1(const struct task_struct *p)
} else {
if (KDB_TSK(cpu) != p)
kdb_printf(" Error: does not match running "
- "process table (0x%p)\n", KDB_TSK(cpu));
+ "process table (0x%px)\n", KDB_TSK(cpu));
}
}
}
@@ -2687,7 +2672,7 @@ int kdb_register_flags(char *cmd,
for_each_kdbcmd(kp, i) {
if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
kdb_printf("Duplicate kdb command registered: "
- "%s, func %p help %s\n", cmd, func, help);
+ "%s, func %px help %s\n", cmd, func, help);
return 1;
}
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 1e5a502ba4a7..2118d8258b7c 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -83,7 +83,7 @@ typedef struct __ksymtab {
unsigned long sym_start;
unsigned long sym_end;
} kdb_symtab_t;
-extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size);
extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
/* Exported Symbols for kernel loadable modules to use. */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 990b3cc526c8..50bf9b119bad 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -40,7 +40,7 @@
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname,
symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
@@ -88,7 +88,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
char *knt1 = NULL;
if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
@@ -149,7 +149,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->mod_name = "kernel";
if (KDB_DEBUG(AR))
kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret,
symtab->sym_start, symtab->mod_name, symtab->sym_name,
symtab->sym_name);
@@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
* Parameters:
* prefix_name prefix of a symbol name to lookup
* flag 0 means search from the head, 1 means continue search.
+ * buf_size maximum length that can be written to prefix_name
+ * buffer
* Returns:
* 1 if a symbol matches the given prefix.
* 0 if no string found
*/
-int kallsyms_symbol_next(char *prefix_name, int flag)
+int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size)
{
int prefix_len = strlen(prefix_name);
static loff_t pos;
@@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag)
pos = 0;
while ((name = kdb_walk_kallsyms(&pos))) {
- if (strncmp(name, prefix_name, prefix_len) == 0) {
- strncpy(prefix_name, name, strlen(name)+1);
- return 1;
- }
+ if (!strncmp(name, prefix_name, prefix_len))
+ return strscpy(prefix_name, name, buf_size);
}
return 0;
}
@@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
@@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getword: bad width %ld\n", (long) size);
@@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
diag = kdb_putarea(addr, w8);
break;
}
- /* drop through */
+ /* fall through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_putword: bad width %ld\n", (long) size);
@@ -887,13 +887,13 @@ void debug_kusage(void)
__func__, dah_first);
if (dah_first) {
h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ kdb_printf("%s: h_used %px size %d\n", __func__, h_used,
h_used->size);
}
do {
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %p size %d caller %p\n",
+ kdb_printf("%s: h_used %px size %d caller %px\n",
__func__, h_used, h_used->size, h_used->caller);
h_free = (struct debug_alloc_header *)
(debug_alloc_pool + h_free->next);
@@ -902,7 +902,7 @@ void debug_kusage(void)
((char *)h_free + dah_overhead + h_free->size);
if ((char *)h_used - debug_alloc_pool !=
sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %p size %d caller %p\n",
+ kdb_printf("%s: h_used %px size %d caller %px\n",
__func__, h_used, h_used->size, h_used->caller);
out:
spin_unlock(&dap_lock);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 5731daa09a32..045930e32c0e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -679,7 +679,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
}
if (!dev_is_dma_coherent(dev) &&
- (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
+ (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 &&
+ dev_addr != DIRECT_MAPPING_ERROR)
arch_sync_dma_for_device(dev, phys, size, dir);
return dev_addr;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 96d4bee83489..abbd8da9ac21 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -572,7 +572,9 @@ static void put_uprobe(struct uprobe *uprobe)
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
+ mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
kfree(uprobe);
}
}
@@ -829,7 +831,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
- smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
@@ -2178,10 +2180,18 @@ static void handle_swbp(struct pt_regs *regs)
* After we hit the bp, _unregister + _register can install the
* new and not-yet-analyzed uprobe at the same address, restart.
*/
- smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto out;
+ /*
+ * Pairs with the smp_wmb() in prepare_uprobe().
+ *
+ * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
+ * we must also see the stores to &uprobe->arch performed by the
+ * prepare_uprobe() call.
+ */
+ smp_rmb();
+
/* Tracing handlers use ->utask to communicate with fetch methods */
if (!get_utask())
goto out;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3ebd09efe72a..97959d7b77e2 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -56,7 +56,7 @@ struct kcov {
struct task_struct *t;
};
-static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
+static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
unsigned int mode;
@@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
return mode == needed_mode;
}
-static unsigned long canonicalize_ip(unsigned long ip)
+static notrace unsigned long canonicalize_ip(unsigned long ip)
{
#ifdef CONFIG_RANDOMIZE_BASE
ip -= kaslr_offset();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 80b34dffdfb9..c2cee9db5204 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
- if (mode & PTRACE_MODE_SCHED)
- return false;
-
if (mode & PTRACE_MODE_NOAUDIT)
return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
else
@@ -331,16 +328,9 @@ ok:
!ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
- if (mode & PTRACE_MODE_SCHED)
- return 0;
return security_ptrace_access_check(task, mode);
}
-bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
-{
- return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
-}
-
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
int err;
diff --git a/kernel/resource.c b/kernel/resource.c
index b3a3a1fc499e..b0fbf685c77a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -319,16 +319,23 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/**
- * Finds the lowest iomem resource that covers part of [start..end]. The
- * caller must specify start, end, flags, and desc (which may be
+ * Finds the lowest iomem resource that covers part of [@start..@end]. The
+ * caller must specify @start, @end, @flags, and @desc (which may be
* IORES_DESC_NONE).
*
- * If a resource is found, returns 0 and *res is overwritten with the part
- * of the resource that's within [start..end]; if none is found, returns
- * -1.
+ * If a resource is found, returns 0 and @*res is overwritten with the part
+ * of the resource that's within [@start..@end]; if none is found, returns
+ * -1 or -EINVAL for other invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
+ *
+ * @start: start address of the resource searched for
+ * @end: end address of same resource
+ * @flags: flags which the resource must have
+ * @desc: descriptor the resource must have
+ * @first_lvl: walk only the first level children, if set
+ * @res: return ptr, if resource found
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
@@ -399,6 +406,8 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
* @flags: I/O resource flags
* @start: start addr
* @end: end addr
+ * @arg: function argument for the callback @func
+ * @func: callback function that is called for each qualifying resource area
*
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f26b70..6fedf3a98581 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)
#ifdef CONFIG_SCHED_SMT
/*
- * The sched_smt_present static key needs to be evaluated on every
- * hotplug event because at boot time SMT might be disabled when
- * the number of booted CPUs is limited.
- *
- * If then later a sibling gets hotplugged, then the key would stay
- * off and SMT scheduling would never be functional.
+ * When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
- static_branch_enable_cpuslocked(&sched_smt_present);
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
@@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu_mult(call_rcu, call_rcu_sched);
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+
if (!sched_smp_initialized)
return 0;
@@ -5851,11 +5854,14 @@ void __init sched_init_smp(void)
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
- * happen.
+ * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
+ * but there won't be any contention on it.
*/
+ cpus_read_lock();
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
local = 1;
/*
- * Retry task to preferred node migration periodically, in case it
- * case it previously failed, or the scheduler moved us.
+ * Retry to migrate task to preferred node periodically, in case it
+ * previously failed, or the scheduler moved us.
*/
if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
{
- return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+ return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
}
/*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
- spare_cap = capacity_spare_wake(i, p);
+ spare_cap = capacity_spare_without(i, p);
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
- * last_update_time.
+ * We need task's util for capacity_spare_without, sync it up to
+ * prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
}
/*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU whose utilization is requested
+ * @p: the task whose utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
*/
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
- /* Discount task's blocked util from CPU's util */
+ /* Discount task's util from CPU's util */
util -= min_t(unsigned int, util, task_util(p));
/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
- * cpu_util_wake = (cpu_util - task_util) = 0
+ * cpu_util_without = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
- * cpu_util_wake = (cpu_util - task_util) >= 0
+ * cpu_util_without = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* covered by the following code when estimated utilization is
* enabled.
*/
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ if (sched_feat(UTIL_EST)) {
+ unsigned int estimated =
+ READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * Despite the following checks we still have a small window
+ * for a possible race, when an execl's select_task_rq_fair()
+ * races with LB's detach_task():
+ *
+ * detach_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * ---------------------------------- A
+ * deactivate_task() \
+ * dequeue_task() + RaceTime
+ * util_est_dequeue() /
+ * ---------------------------------- B
+ *
+ * The additional check on "current == p" is required to
+ * properly fix the execl regression and it helps in further
+ * reducing the chances for the above race.
+ */
+ if (unlikely(task_on_rq_queued(p) || current == p)) {
+ estimated -= min_t(unsigned int, estimated,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ }
+ util = max(util, estimated);
+ }
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7cdecfc010af..fe24de3fbc93 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,8 +136,18 @@
static int psi_bug __read_mostly;
-bool psi_disabled __read_mostly;
-core_param(psi_disabled, psi_disabled, bool, 0644);
+DEFINE_STATIC_KEY_FALSE(psi_disabled);
+
+#ifdef CONFIG_PSI_DEFAULT_DISABLED
+bool psi_enable;
+#else
+bool psi_enable = true;
+#endif
+static int __init setup_psi(char *str)
+{
+ return kstrtobool(str, &psi_enable) == 0;
+}
+__setup("psi=", setup_psi);
/* Running averages - we need to be higher-res than loadavg */
#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
@@ -169,8 +179,10 @@ static void group_init(struct psi_group *group)
void __init psi_init(void)
{
- if (psi_disabled)
+ if (!psi_enable) {
+ static_branch_enable(&psi_disabled);
return;
+ }
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
@@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags)
struct rq_flags rf;
struct rq *rq;
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
*flags = current->flags & PF_MEMSTALL;
@@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags)
struct rq_flags rf;
struct rq *rq;
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
if (*flags)
@@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags)
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return 0;
cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
@@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
cancel_delayed_work_sync(&cgroup->psi.clock_work);
@@ -633,38 +645,39 @@ void psi_cgroup_free(struct cgroup *cgroup)
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- bool move_psi = !psi_disabled;
unsigned int task_flags = 0;
struct rq_flags rf;
struct rq *rq;
- if (move_psi) {
- rq = task_rq_lock(task, &rf);
+ if (static_branch_likely(&psi_disabled)) {
+ /*
+ * Lame to do this here, but the scheduler cannot be locked
+ * from the outside, so we move cgroups from inside sched/.
+ */
+ rcu_assign_pointer(task->cgroups, to);
+ return;
+ }
- if (task_on_rq_queued(task))
- task_flags = TSK_RUNNING;
- else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
+ rq = task_rq_lock(task, &rf);
- if (task->flags & PF_MEMSTALL)
- task_flags |= TSK_MEMSTALL;
+ if (task_on_rq_queued(task))
+ task_flags = TSK_RUNNING;
+ else if (task->in_iowait)
+ task_flags = TSK_IOWAIT;
- if (task_flags)
- psi_task_change(task, task_flags, 0);
- }
+ if (task->flags & PF_MEMSTALL)
+ task_flags |= TSK_MEMSTALL;
- /*
- * Lame to do this here, but the scheduler cannot be locked
- * from the outside, so we move cgroups from inside sched/.
- */
+ if (task_flags)
+ psi_task_change(task, task_flags, 0);
+
+ /* See comment above */
rcu_assign_pointer(task->cgroups, to);
- if (move_psi) {
- if (task_flags)
- psi_task_change(task, 0, task_flags);
+ if (task_flags)
+ psi_task_change(task, 0, task_flags);
- task_rq_unlock(rq, task, &rf);
- }
+ task_rq_unlock(rq, task, &rf);
}
#endif /* CONFIG_CGROUPS */
@@ -672,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
int full;
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
update_stats(group);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577fc9aa8..4e524ab589c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -23,6 +23,7 @@
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
@@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq)
#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4904c4677000..aa0de240fb41 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
{
int clear = 0, set = TSK_RUNNING;
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
if (!wakeup || p->sched_psi_wake_requeue) {
@@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
int clear = TSK_RUNNING, set = 0;
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
if (!sleep) {
@@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
static inline void psi_ttwu_dequeue(struct task_struct *p)
{
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
/*
* Is the task being migrated during a wakeup? Make sure to
@@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
static inline void psi_task_tick(struct rq *rq)
{
- if (psi_disabled)
+ if (static_branch_likely(&psi_disabled))
return;
if (unlikely(rq->curr->flags & PF_MEMSTALL))
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index e42892926244..b193a59fc05b 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -11,6 +11,7 @@
*/
#include <linux/stackleak.h>
+#include <linux/kprobes.h>
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
#include <linux/jump_label.h>
@@ -47,7 +48,7 @@ int stack_erasing_sysctl(struct ctl_table *table, int write,
#define skip_erasing() false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-asmlinkage void stackleak_erase(void)
+asmlinkage void notrace stackleak_erase(void)
{
/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
unsigned long kstack_ptr = current->lowest_stack;
@@ -101,8 +102,9 @@ asmlinkage void stackleak_erase(void)
/* Reset the 'lowest_stack' value for the next syscall */
current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
}
+NOKPROBE_SYMBOL(stackleak_erase);
-void __used stackleak_track_stack(void)
+void __used notrace stackleak_track_stack(void)
{
/*
* N.B. stackleak_erase() fills the kernel stack with the poison value,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ce32cf741b25..8f0644af40be 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -917,9 +917,6 @@ static void check_process_timers(struct task_struct *tsk,
struct task_cputime cputime;
unsigned long soft;
- if (dl_task(tsk))
- check_dl_overrun(tsk);
-
/*
* If cputimer is not running, then there are no active
* process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08fcfe440c63..9864a35c8bb5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
} else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
- i++;
- if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+ /* disallow any further format extensions */
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1]))
return -EINVAL;
fmt_cnt++;
- if (fmt[i - 1] == 's') {
+ if (fmt[i] == 's') {
if (str_seen)
/* allow only one '%s' per fmt string */
return -EINVAL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f536f601bd46..77734451cb05 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -817,7 +817,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
- int index = trace->depth;
+ int index = current->curr_ret_stack;
function_profile_call(trace->func, 0, NULL, NULL);
@@ -852,7 +852,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
if (!fgraph_graph_time) {
int index;
- index = trace->depth;
+ index = current->curr_ret_stack;
/* Append this call time to the parent time to subtract */
if (index)
@@ -6814,6 +6814,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
/* Make sure the tasks see the -1 first: */
smp_wmb();
t->ret_stack = ret_stack_list[start++];
@@ -7038,6 +7039,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
{
t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
/*
* The idle task has no parent, it either has its own
* stack or no stack at all.
@@ -7068,6 +7070,7 @@ void ftrace_graph_init_task(struct task_struct *t)
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
t->curr_ret_stack = -1;
+ t->curr_ret_depth = -1;
if (ftrace_graph_active) {
struct ftrace_ret_stack *ret_stack;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3b8c0e24ab30..447bd96ee658 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -512,12 +512,44 @@ enum {
* can only be modified by current, we can reuse trace_recursion.
*/
TRACE_IRQ_BIT,
+
+ /* Set if the function is in the set_graph_function file */
+ TRACE_GRAPH_BIT,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_BIT
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
};
#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+#define trace_recursion_depth() \
+ (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
+#define trace_recursion_set_depth(depth) \
+ do { \
+ current->trace_recursion &= \
+ ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \
+ current->trace_recursion |= \
+ ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \
+ } while (0)
+
#define TRACE_CONTEXT_BITS 4
#define TRACE_FTRACE_START TRACE_FTRACE_BIT
@@ -843,8 +875,9 @@ extern void __trace_graph_return(struct trace_array *tr,
extern struct ftrace_hash *ftrace_graph_hash;
extern struct ftrace_hash *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(unsigned long addr)
+static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
+ unsigned long addr = trace->func;
int ret = 0;
preempt_disable_notrace();
@@ -855,6 +888,14 @@ static inline int ftrace_graph_addr(unsigned long addr)
}
if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+
+ /*
+ * This needs to be cleared on the return functions
+ * when the depth is zero.
+ */
+ trace_recursion_set(TRACE_GRAPH_BIT);
+ trace_recursion_set_depth(trace->depth);
+
/*
* If no irqs are to be traced, but a set_graph_function
* is set, and called by an interrupt handler, we still
@@ -872,6 +913,13 @@ out:
return ret;
}
+static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+{
+ if (trace_recursion_test(TRACE_GRAPH_BIT) &&
+ trace->depth == trace_recursion_depth())
+ trace_recursion_clear(TRACE_GRAPH_BIT);
+}
+
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
@@ -885,7 +933,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(unsigned long addr)
+static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -894,6 +942,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
+static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
@@ -901,7 +951,8 @@ extern unsigned int fgraph_max_depth;
static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
{
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace->depth || ftrace_graph_addr(trace->func)) ||
+ return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
+ ftrace_graph_addr(trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 169b3c44ee97..086af4f5c3e8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -118,8 +118,8 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags);
/* Add a function return address to the trace stack on thread info.*/
-int
-ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
+static int
+ftrace_push_return_trace(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
unsigned long long calltime;
@@ -177,9 +177,31 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
current->ret_stack[index].retp = retp;
#endif
- *depth = current->curr_ret_stack;
+ return 0;
+}
+
+int function_graph_enter(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp)
+{
+ struct ftrace_graph_ent trace;
+
+ trace.func = func;
+ trace.depth = ++current->curr_ret_depth;
+
+ if (ftrace_push_return_trace(ret, func,
+ frame_pointer, retp))
+ goto out;
+
+ /* Only trace if the calling function expects to */
+ if (!ftrace_graph_entry(&trace))
+ goto out_ret;
return 0;
+ out_ret:
+ current->curr_ret_stack--;
+ out:
+ current->curr_ret_depth--;
+ return -EBUSY;
}
/* Retrieve a function return address to the trace stack on thread info.*/
@@ -241,7 +263,13 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
trace->func = current->ret_stack[index].func;
trace->calltime = current->ret_stack[index].calltime;
trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = index;
+ trace->depth = current->curr_ret_depth--;
+ /*
+ * We still want to trace interrupts coming in if
+ * max_depth is set to 1. Make sure the decrement is
+ * seen before ftrace_graph_return.
+ */
+ barrier();
}
/*
@@ -255,6 +283,12 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
+ ftrace_graph_return(&trace);
+ /*
+ * The ftrace_graph_return() may still access the current
+ * ret_stack structure, we need to make sure the update of
+ * curr_ret_stack is after that.
+ */
barrier();
current->curr_ret_stack--;
/*
@@ -267,13 +301,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
- /*
- * The trace should run after decrementing the ret counter
- * in case an interrupt were to come in. We don't want to
- * lose the interrupt if max_depth is set.
- */
- ftrace_graph_return(&trace);
-
if (unlikely(!ret)) {
ftrace_graph_stop();
WARN_ON(1);
@@ -482,6 +509,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
int cpu;
int pc;
+ ftrace_graph_addr_finish(trace);
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -505,6 +534,8 @@ void set_graph_array(struct trace_array *tr)
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
+ ftrace_graph_addr_finish(trace);
+
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
return;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b7357f9f82a3..98ea6d28df15 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -208,6 +208,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
int pc;
+ ftrace_graph_addr_finish(trace);
+
if (!func_prolog_dec(tr, &data, &flags))
return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index a86b303e6c67..7d04b9890755 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
int pc;
+ ftrace_graph_addr_finish(trace);
+
if (!func_prolog_preempt_disable(tr, &data, &pc))
return;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e5222b5fb4fe..923414a246e9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
- ret = sort_idmaps(&new_map);
- if (ret < 0)
- goto out;
-
ret = -EPERM;
/* Map the lower ids from the parent user namespace to the
* kernel global id space.
@@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf,
e->lower_first = lower_first;
}
+ /*
+ * If we want to use binary search for lookup, this clones the extent
+ * array and sorts both copies.
+ */
+ ret = sort_idmaps(&new_map);
+ if (ret < 0)
+ goto out;
+
/* Install the map */
if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
memcpy(map->extent, new_map.extent,