diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2020-05-11 22:54:30 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2020-05-11 22:54:30 +0200 |
commit | 68f0f2690e183306b52671a9ad09fb31808b0500 (patch) | |
tree | ed86c9a38d307f609ccaab0baac367df3e7e5e39 /kernel | |
parent | baf5fe761846815164753d1bd0638fd3696db8fd (diff) | |
parent | f736e0f1a55a88cb258b73da77463573739e9ac9 (diff) |
Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull RCU updates from Paul McKenney:
1. Miscellaneous fixes.
2. kfree_rcu() updates.
3. Remove scheduler locking restriction
4. RCU-tasks update, including addition of RCU Tasks Trace for
BPF use and RCU Tasks Rude. (This branch is on top of #3 due
to overlap of changed code.)
5. RCU CPU stall warning updates.
6. Torture-test updates.
Diffstat (limited to 'kernel')
149 files changed, 8945 insertions, 3973 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 34d1e77ee9df..78701ea37c97 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,6 +1,4 @@ -# -# Generated files -# +# SPDX-License-Identifier: GPL-2.0-only kheaders.md5 timeconst.h hz.bc diff --git a/kernel/audit.c b/kernel/audit.c index 9ddfe2aa6671..b69c8b460341 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1800,6 +1800,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); + audit_clear_dummy(ab->ctx); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/audit.h b/kernel/audit.h index 6fb7160412d4..2eed4d231624 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -290,6 +290,13 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); + +static inline void audit_clear_dummy(struct audit_context *ctx) +{ + if (ctx) + ctx->dummy = 0; +} + #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -323,6 +330,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED +#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f0d243318452..3596448bfdab 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -160,23 +160,14 @@ static int audit_mark_handle_event(struct fsnotify_group *group, { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; - const struct inode *inode = NULL; + const struct inode *inode = fsnotify_data_inode(data, data_type); audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); BUG_ON(group != audit_fsnotify_group); - switch (data_type) { - case (FSNOTIFY_EVENT_PATH): - inode = ((const struct path *)data)->dentry->d_inode; - break; - case (FSNOTIFY_EVENT_INODE): - inode = (const struct inode *)data; - break; - default: - BUG(); + if (WARN_ON(!inode)) return 0; - } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4508d5e0cf69..e09c551ae52d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -302,8 +302,6 @@ static void audit_update_watch(struct audit_parent *parent, if (oentry->rule.exe) audit_remove_mark(oentry->rule.exe); - audit_watch_log_rule_change(r, owatch, "updated_rules"); - call_rcu(&oentry->rcu, audit_free_rule_rcu); } @@ -473,25 +471,13 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - const struct inode *inode; + const struct inode *inode = fsnotify_data_inode(data, data_type); struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - - switch (data_type) { - case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((const struct path *)data)->dentry); - break; - case (FSNOTIFY_EVENT_INODE): - inode = (const struct inode *)data; - break; - default: - BUG(); - inode = NULL; - break; - } + WARN_ON(!inode); if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4effe01ebbe2..814406a35db1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1406,9 +1406,6 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; - if (!context || context->dummy) - return; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 046ce5d98033..f2d7be596966 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,4 +29,5 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index f02504640e18..6b12f06ee18c 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -30,7 +30,7 @@ struct bpf_lru_node { struct bpf_lru_list { struct list_head lists[NR_BPF_LRU_LIST_T]; unsigned int counts[NR_BPF_LRU_LIST_COUNT]; - /* The next inacitve list rotation starts from here */ + /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; raw_spinlock_t lock ____cacheline_aligned_in_smp; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c new file mode 100644 index 000000000000..19636703b24e --- /dev/null +++ b/kernel/bpf/bpf_lsm.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <linux/filter.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/lsm_hooks.h> +#include <linux/bpf_lsm.h> +#include <linux/kallsyms.h> +#include <linux/bpf_verifier.h> + +/* For every LSM hook that allows attachment of BPF programs, declare a nop + * function where a BPF program can be attached. + */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK + +#define BPF_LSM_SYM_PREFX "bpf_lsm_" + +int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "LSM programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, + sizeof(BPF_LSM_SYM_PREFX) - 1)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +const struct bpf_prog_ops lsm_prog_ops = { +}; + +const struct bpf_verifier_ops lsm_verifier_ops = { + .get_func_proto = bpf_tracing_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..26cb51f2db72 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,7 @@ enum bpf_struct_ops_state { struct bpf_struct_ops_value { BPF_STRUCT_OPS_COMMON_VALUE; - char data[0] ____cacheline_aligned_in_smp; + char data[] ____cacheline_aligned_in_smp; }; struct bpf_struct_ops_map { @@ -320,6 +320,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; + struct bpf_tramp_progs *tprogs = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image; @@ -343,6 +344,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return -ENOMEM; + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; @@ -425,10 +430,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; + tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; err = arch_prepare_bpf_trampoline(image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, - &prog, 1, NULL, 0, NULL); + tprogs, NULL); if (err < 0) goto reset_unlock; @@ -469,6 +476,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: + kfree(tprogs); mutex_unlock(&st_map->lock); return err; } @@ -482,13 +490,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..d65c6912bdaf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; @@ -3477,8 +3477,8 @@ errout: return ERR_PTR(err); } -extern char __weak _binary__btf_vmlinux_bin_start[]; -extern char __weak _binary__btf_vmlinux_bin_end[]; +extern char __weak __start_BTF[]; +extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -3605,9 +3605,8 @@ struct btf *btf_parse_vmlinux(void) } env->btf = btf; - btf->data = _binary__btf_vmlinux_bin_start; - btf->data_size = _binary__btf_vmlinux_bin_end - - _binary__btf_vmlinux_bin_start; + btf->data = __start_BTF; + btf->data_size = __stop_BTF - __start_BTF; err = btf_parse_hdr(env); if (err) @@ -3710,23 +3709,60 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && - arg == nr_args) { - if (!t) - /* Default prog with 5 args. 6th arg is retval. */ - return true; - /* function return type */ - t = btf_type_by_id(btf, t->type); - } else if (arg >= nr_args) { + if (arg > nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; + } + + if (arg == nr_args) { + switch (prog->expected_attach_type) { + case BPF_LSM_MAC: + case BPF_TRACE_FEXIT: + /* When LSM programs are attached to void LSM hooks + * they use FEXIT trampolines and when attached to + * int LSM hooks, they use MODIFY_RETURN trampolines. + * + * While the LSM programs are BPF_MODIFY_RETURN-like + * the check: + * + * if (ret_type != 'int') + * return -EINVAL; + * + * is _not_ done here. This is still safe as LSM hooks + * have only void and int return types. + */ + if (!t) + return true; + t = btf_type_by_id(btf, t->type); + break; + case BPF_MODIFY_RETURN: + /* For now the BPF_MODIFY_RETURN can only be attached to + * functions that return an int. + */ + if (!t) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!btf_type_is_int(t)) { + bpf_log(log, + "ret type %s not allowed for fmod_ret\n", + btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + break; + default: + bpf_log(log, "func '%s' doesn't have %d-th argument\n", + tname, arg + 1); + return false; + } } else { if (!t) /* Default prog with 5 args */ return true; t = btf_type_by_id(btf, args[arg].type); } + /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -4564,7 +4600,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4573,6 +4609,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9a500fadbef5..cb305e71e7de 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -28,6 +28,69 @@ void cgroup_bpf_offline(struct cgroup *cgrp) percpu_ref_kill(&cgrp->bpf.refcnt); } +static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storages[stype]); +} + +static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], + struct bpf_prog *prog) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storages[stype])) { + storages[stype] = NULL; + bpf_cgroup_storages_free(storages); + return -ENOMEM; + } + } + + return 0; +} + +static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], + struct bpf_cgroup_storage *src[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + dst[stype] = src[stype]; +} + +static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], + struct cgroup* cgrp, + enum bpf_attach_type attach_type) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); +} + +static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_unlink(storages[stype]); +} + +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It + * doesn't free link memory, which will eventually be done by bpf_link's + * release() callback, when its last FD is closed. + */ +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) +{ + cgroup_put(link->cgroup); + link->cgroup = NULL; +} + /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data @@ -37,7 +100,6 @@ static void cgroup_bpf_release(struct work_struct *work) { struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); - enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -49,11 +111,12 @@ static void cgroup_bpf_release(struct work_struct *work) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); - bpf_prog_put(pl->prog); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } + if (pl->prog) + bpf_prog_put(pl->prog); + if (pl->link) + bpf_cgroup_link_auto_detach(pl->link); + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -85,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) queue_work(system_wq, &cgrp->bpf.release_work); } +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through + * link or direct prog. + */ +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) +{ + if (pl->prog) + return pl->prog; + if (pl->link) + return pl->link->link.prog; + return NULL; +} + /* count number of elements in the list. * it's slow but the list cannot be long */ @@ -94,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) u32 cnt = 0; list_for_each_entry(pl, head, node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; cnt++; } @@ -138,7 +213,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array **array) { - enum bpf_cgroup_storage_type stype; + struct bpf_prog_array_item *item; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -163,13 +238,13 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; list_for_each_entry(pl, &p->bpf.progs[type], node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; - progs->items[cnt].prog = pl->prog; - for_each_cgroup_storage_type(stype) - progs->items[cnt].cgroup_storage[stype] = - pl->storage[stype]; + item = &progs->items[cnt]; + item->prog = prog_list_prog(pl); + bpf_cgroup_storages_assign(item->cgroup_storage, + pl->storage); cnt++; } } while ((p = cgroup_parent(p))); @@ -227,6 +302,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; @@ -284,34 +362,80 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + struct bpf_prog *replace_prog, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + /* single-attach case */ + if (!allow_multi) { + if (list_empty(progs)) + return NULL; + return list_first_entry(progs, typeof(*pl), node); + } + + list_for_each_entry(pl, progs, node) { + if (prog && pl->prog == prog) + /* disallow attaching the same prog twice */ + return ERR_PTR(-EINVAL); + if (link && pl->link == link) + /* disallow attaching the same link twice */ + return ERR_PTR(-EINVAL); + } + + /* direct prog multi-attach w/ replacement case */ + if (replace_prog) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == replace_prog) + /* a match found */ + return pl; + } + /* prog to replace not found for cgroup */ + return ERR_PTR(-ENOENT); + } + + return NULL; +} + /** - * __cgroup_bpf_attach() - Attach the program to a cgroup, and + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * + * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; - struct bpf_prog_list *pl, *replace_pl = NULL; - enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog_list *pl; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if (link && (prog || replace_prog)) + /* only either link or prog/replace_prog can be specified */ + return -EINVAL; + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) + /* replace_prog implies BPF_F_REPLACE, and vice versa */ + return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; @@ -326,140 +450,203 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) - /* disallow attaching the same prog twice */ - return -EINVAL; - if (pl->prog == replace_prog) - replace_pl = pl; - } - if ((flags & BPF_F_REPLACE) && !replace_pl) - /* prog to replace not found for cgroup */ - return -ENOENT; - } else if (!list_empty(progs)) { - replace_pl = list_first_entry(progs, typeof(*pl), node); - } + pl = find_attach_entry(progs, prog, link, replace_prog, + flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); - for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - } + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + return -ENOMEM; - if (replace_pl) { - pl = replace_pl; + if (pl) { old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storages_free(storage); return -ENOMEM; } list_add_tail(&pl->node, progs); } pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; - + pl->link = link; + bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) goto cleanup; - static_branch_inc(&cgroup_bpf_enabled_key); - for_each_cgroup_storage_type(stype) { - if (!old_storage[stype]) - continue; - bpf_cgroup_storage_free(old_storage[stype]); - } - if (old_prog) { + bpf_cgroup_storages_free(old_storage); + if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); - } - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_link(storage[stype], cgrp, type); + else + static_branch_inc(&cgroup_bpf_enabled_key); + bpf_cgroup_storages_link(pl->storage, cgrp, type); return 0; cleanup: - /* and cleanup the prog list */ - pl->prog = old_prog; - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_free(pl->storage[stype]); - pl->storage[stype] = old_storage[stype]; - bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + if (old_prog) { + pl->prog = old_prog; + pl->link = NULL; } - if (!replace_pl) { + bpf_cgroup_storages_free(pl->storage); + bpf_cgroup_storages_assign(pl->storage, old_storage); + bpf_cgroup_storages_link(pl->storage, cgrp, type); + if (!old_prog) { list_del(&pl->node); kfree(pl); } return err; } +/* Swap updated BPF program for given link in effective program arrays across + * all descendant cgroups. This function is guaranteed to succeed. + */ +static void replace_effective_prog(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_cgroup_link *link) +{ + struct bpf_prog_array_item *item; + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[type]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + item = &progs->items[pos]; + WRITE_ONCE(item->prog, link->link.prog); + } +} + /** - * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * __cgroup_bpf_replace() - Replace link's program and propagate the change + * to descendants + * @cgrp: The cgroup which descendants to traverse + * @link: A link for which to replace BPF program + * @type: Type of attach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) +{ + struct list_head *progs = &cgrp->bpf.progs[link->type]; + struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + bool found = false; + + if (link->link.prog->type != new_prog->type) + return -EINVAL; + + list_for_each_entry(pl, progs, node) { + if (pl->link == link) { + found = true; + break; + } + } + if (!found) + return -ENOENT; + + old_prog = xchg(&link->link.prog, new_prog); + replace_effective_prog(cgrp, link->type, link); + bpf_prog_put(old_prog); + return 0; +} + +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + if (!allow_multi) { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return ERR_PTR(-ENOENT); + + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) in legacy mode + */ + return list_first_entry(progs, typeof(*pl), node); + } + + if (!prog && !link) + /* to detach MULTI prog the user has to specify valid FD + * of the program or link to be detached + */ + return ERR_PTR(-EINVAL); + + /* find the prog or link and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog && pl->link == link) + return pl; + } + return ERR_PTR(-ENOENT); +} + +/** + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL + * @prog: A link to detach or NULL * @type: Type of detach operation * + * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; - enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; + struct bpf_prog *old_prog; int err; - if (flags & BPF_F_ALLOW_MULTI) { - if (!prog) - /* to detach MULTI prog the user has to specify valid FD - * of the program to be detached - */ - return -EINVAL; - } else { - if (list_empty(progs)) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - } + if (prog && link) + /* only one of prog or link can be specified */ + return -EINVAL; - if (flags & BPF_F_ALLOW_MULTI) { - /* find the prog and detach it */ - list_for_each_entry(pl, progs, node) { - if (pl->prog != prog) - continue; - old_prog = prog; - /* mark it deleted, so it's ignored while - * recomputing effective - */ - pl->prog = NULL; - break; - } - if (!old_prog) - return -ENOENT; - } else { - /* to maintain backward compatibility NONE and OVERRIDE cgroups - * allow detaching with invalid FD (prog==NULL) - */ - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - pl->prog = NULL; - } + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + /* mark it deleted, so it's ignored while recomputing effective */ + old_prog = pl->prog; + pl->prog = NULL; + pl->link = NULL; err = update_effective_progs(cgrp, type); if (err) @@ -467,22 +654,21 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[type] = 0; - - bpf_prog_put(old_prog); + if (old_prog) + bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); return 0; cleanup: - /* and restore back old_prog */ + /* restore back prog or link */ pl->prog = old_prog; + pl->link = link; return err; } @@ -495,6 +681,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog_array *effective; + struct bpf_prog *prog; int cnt, ret = 0, i; effective = rcu_dereference_protected(cgrp->bpf.effective[type], @@ -525,7 +712,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, i = 0; list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; + prog = prog_list_prog(pl); + id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) @@ -555,8 +743,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } } - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, - attr->attach_flags); + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, + attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); @@ -578,7 +766,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); @@ -586,6 +774,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static void bpf_cgroup_link_release(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + /* link might have been auto-detached by dying cgroup already, + * in that case our work is done here + */ + if (!cg_link->cgroup) + return; + + mutex_lock(&cgroup_mutex); + + /* re-check cgroup under lock again */ + if (!cg_link->cgroup) { + mutex_unlock(&cgroup_mutex); + return; + } + + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, + cg_link->type)); + + mutex_unlock(&cgroup_mutex); + cgroup_put(cg_link->cgroup); +} + +static void bpf_cgroup_link_dealloc(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + kfree(cg_link); +} + +const struct bpf_link_ops bpf_cgroup_link_lops = { + .release = bpf_cgroup_link_release, + .dealloc = bpf_cgroup_link_dealloc, +}; + +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_cgroup_link *link; + struct file *link_file; + struct cgroup *cgrp; + int err, link_fd; + + if (attr->link_create.flags) + return -EINVAL; + + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_cgroup; + } + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + link->cgroup = cgrp; + link->type = attr->link_create.attach_type; + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_cgroup; + } + + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, + BPF_F_ALLOW_MULTI); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_cgroup; + } + + fd_install(link_fd, link_file); + return link_fd; + +out_put_cgroup: + cgroup_put(cgrp); + return err; +} + int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..916f5132a984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); return fp; } @@ -523,22 +523,22 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; -static __always_inline void -bpf_get_prog_addr_region(const struct bpf_prog *prog, - unsigned long *symbol_start, - unsigned long *symbol_end) +static void +bpf_prog_ksym_set_addr(struct bpf_prog *prog) { const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); unsigned long addr = (unsigned long)hdr; WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); - *symbol_start = addr; - *symbol_end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.start = (unsigned long) prog->bpf_func; + prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +static void +bpf_prog_ksym_set_name(struct bpf_prog *prog) { + char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; @@ -572,36 +572,27 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) +static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; + return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); + return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; + const struct bpf_ksym *ksym; - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + ksym = container_of(n, struct bpf_ksym, tnode); - if (val < symbol_start) + if (val < ksym->start) return -1; - if (val >= symbol_end) + if (val >= ksym->end) return 1; return 0; @@ -616,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +void bpf_ksym_add(struct bpf_ksym *ksym) { - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + spin_lock_bh(&bpf_lock); + WARN_ON_ONCE(!list_empty(&ksym->lnode)); + list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); + latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + spin_unlock_bh(&bpf_lock); } -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +static void __bpf_ksym_del(struct bpf_ksym *ksym) { - if (list_empty(&aux->ksym_lnode)) + if (list_empty(&ksym->lnode)) return; - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); + latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&ksym->lnode); +} + +void bpf_ksym_del(struct bpf_ksym *ksym) +{ + spin_lock_bh(&bpf_lock); + __bpf_ksym_del(ksym); + spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -639,8 +639,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; } void bpf_prog_kallsyms_add(struct bpf_prog *fp) @@ -649,9 +649,11 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) !capable(CAP_SYS_ADMIN)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_prog_ksym_set_addr(fp); + bpf_prog_ksym_set_name(fp); + fp->aux->ksym.prog = true; + + bpf_ksym_add(&fp->aux->ksym); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -659,33 +661,30 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_del(&fp->aux->ksym); } -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? - container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : - NULL; + return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - unsigned long symbol_start, symbol_end; - struct bpf_prog *prog; + struct bpf_ksym *ksym; char *ret = NULL; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); - if (prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); - bpf_get_prog_name(prog, sym); + ksym = bpf_ksym_find(addr); + if (ksym) { + unsigned long symbol_start = ksym->start; + unsigned long symbol_end = ksym->end; + + strncpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -703,19 +702,28 @@ bool is_bpf_text_address(unsigned long addr) bool ret; rcu_read_lock(); - ret = bpf_prog_kallsyms_find(addr) != NULL; + ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } +static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + struct bpf_ksym *ksym = bpf_ksym_find(addr); + + return ksym && ksym->prog ? + container_of(ksym, struct bpf_prog_aux, ksym)->prog : + NULL; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); + prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) @@ -730,7 +738,7 @@ out: int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - struct bpf_prog_aux *aux; + struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; @@ -738,13 +746,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; - bpf_get_prog_name(aux->prog, sym); + strncpy(sym, ksym->name, KSYM_NAME_LEN); - *value = (unsigned long)aux->prog->bpf_func; + *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; @@ -2148,7 +2156,9 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b3e5b214fed8..2444bd15cc2d 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); + noff = d->image_off ^ (PAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,9 +140,10 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_image_alloc(); + d->image = bpf_jit_alloc_exec_page(); if (!d->image) goto out; + bpf_image_ksym_add(d->image, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a1468e3f5af2..d541c8486c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -27,9 +27,62 @@ .map_delete_batch = \ generic_map_delete_batch +/* + * The bucket lock has two protection scopes: + * + * 1) Serializing concurrent operations from BPF programs on differrent + * CPUs + * + * 2) Serializing concurrent operations from BPF programs and sys_bpf() + * + * BPF programs can execute in any context including perf, kprobes and + * tracing. As there are almost no limits where perf, kprobes and tracing + * can be invoked from the lock operations need to be protected against + * deadlocks. Deadlocks can be caused by recursion and by an invocation in + * the lock held section when functions which acquire this lock are invoked + * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU + * variable bpf_prog_active, which prevents BPF programs attached to perf + * events, kprobes and tracing to be invoked before the prior invocation + * from one of these contexts completed. sys_bpf() uses the same mechanism + * by pinning the task to the current CPU and incrementing the recursion + * protection accross the map operation. + * + * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain + * operations like memory allocations (even with GFP_ATOMIC) from atomic + * contexts. This is required because even with GFP_ATOMIC the memory + * allocator calls into code pathes which acquire locks with long held lock + * sections. To ensure the deterministic behaviour these locks are regular + * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only + * true atomic contexts on an RT kernel are the low level hardware + * handling, scheduling, low level interrupt handling, NMIs etc. None of + * these contexts should ever do memory allocations. + * + * As regular device interrupt handlers and soft interrupts are forced into + * thread context, the existing code which does + * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * just works. + * + * In theory the BPF locks could be converted to regular spinlocks as well, + * but the bucket locks and percpu_freelist locks can be taken from + * arbitrary contexts (perf, kprobes, tracepoints) which are required to be + * atomic contexts even on RT. These mechanisms require preallocated maps, + * so there is no need to invoke memory allocations within the lock held + * sections. + * + * BPF maps which need dynamic allocation are only used from (forced) + * thread context on RT and can therefore use regular spinlocks which in + * turn allows to invoke memory allocations from the lock held section. + * + * On a non RT kernel this distinction is neither possible nor required. + * spinlock maps to raw_spinlock and the extra code is optimized out by the + * compiler. + */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t lock; + union { + raw_spinlock_t raw_lock; + spinlock_t lock; + }; }; struct bpf_htab { @@ -65,9 +118,54 @@ struct htab_elem { struct bpf_lru_node lru_node; }; u32 hash; - char key[0] __aligned(8); + char key[] __aligned(8); }; +static inline bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + +static inline bool htab_use_raw_lock(const struct bpf_htab *htab) +{ + return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); +} + +static void htab_init_buckets(struct bpf_htab *htab) +{ + unsigned i; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); + if (htab_use_raw_lock(htab)) + raw_spin_lock_init(&htab->buckets[i].raw_lock); + else + spin_lock_init(&htab->buckets[i].lock); + } +} + +static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b) +{ + unsigned long flags; + + if (htab_use_raw_lock(htab)) + raw_spin_lock_irqsave(&b->raw_lock, flags); + else + spin_lock_irqsave(&b->lock, flags); + return flags; +} + +static inline void htab_unlock_bucket(const struct bpf_htab *htab, + struct bucket *b, + unsigned long flags) +{ + if (htab_use_raw_lock(htab)) + raw_spin_unlock_irqrestore(&b->raw_lock, flags); + else + spin_unlock_irqrestore(&b->lock, flags); +} + static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) @@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } -static bool htab_is_prealloc(const struct bpf_htab *htab) -{ - return !(htab->map.map_flags & BPF_F_NO_PREALLOC); -} - static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; u64 cost; + int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->hashrnd = get_random_int(); - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].lock); - } + htab_init_buckets(htab); if (prealloc) { err = prealloc_init(htab); @@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return l == tgt_l; } @@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) struct htab_elem *l = container_of(head, struct htab_elem, rcu); struct bpf_htab *htab = l->htab; - /* must increment bpf_prog_active to avoid kprobe+bpf triggering while - * we're calling kfree, otherwise deadlock is possible if kprobes - * are placed somewhere inside of slub - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); htab_elem_free(htab, l); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); } static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) @@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1325,8 +1403,7 @@ alloc: } again: - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); again_nocopy: dst_key = keys; @@ -1335,7 +1412,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1352,10 +1429,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); goto after_loop; } @@ -1364,10 +1440,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); kvfree(keys); kvfree(values); goto alloc; @@ -1418,7 +1493,7 @@ again_nocopy: dst_val += value_size; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); locked = false; while (node_to_free) { @@ -1437,8 +1512,7 @@ next_batch: } rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, key_size * bucket_cnt) || copy_to_user(uvalues + total * value_size, values, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..bafc53ddd350 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> +#include <linux/pid_namespace.h> +#include <linux/proc_ns.h> #include "../../lib/kstrtox.h" @@ -338,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *ancestor; + + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + return cgroup_id(ancestor); +} + +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { + .func = bpf_get_current_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + #ifdef CONFIG_CGROUP_BPF DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -499,3 +519,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5e40e7fccc21..95087d9f4ed3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -25,6 +25,7 @@ enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, + BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -36,6 +37,9 @@ static void *bpf_any_get(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_inc(raw); + break; default: WARN_ON_ONCE(1); break; @@ -53,6 +57,9 @@ static void bpf_any_put(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_put(raw); + break; default: WARN_ON_ONCE(1); break; @@ -63,20 +70,32 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; - *type = BPF_TYPE_MAP; raw = bpf_map_get_with_uref(ufd); - if (IS_ERR(raw)) { + if (!IS_ERR(raw)) { + *type = BPF_TYPE_MAP; + return raw; + } + + raw = bpf_prog_get(ufd); + if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; - raw = bpf_prog_get(ufd); + return raw; } - return raw; + raw = bpf_link_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_LINK; + return raw; + } + + return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; +static const struct inode_operations bpf_link_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -114,6 +133,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; + else if (inode->i_op == &bpf_link_iops) + *type = BPF_TYPE_LINK; else return -EACCES; @@ -335,6 +356,12 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) &bpffs_map_fops : &bpffs_obj_fops); } +static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, + &bpffs_obj_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -411,6 +438,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; + case BPF_TYPE_LINK: + ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); + break; default: ret = -EPERM; } @@ -487,6 +517,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); + else if (type == BPF_TYPE_LINK) + ret = bpf_link_new_fd(raw); else return -ENOENT; @@ -504,6 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_link_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 56e6c75d354d..65c236cf341e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -25,7 +25,7 @@ struct lpm_trie_node { struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; - u8 data[0]; + u8 data[]; }; struct lpm_trie { @@ -34,7 +34,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Allocate and fill a new node */ @@ -422,7 +422,7 @@ out: kfree(im_node); } - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) kfree_rcu(node, rcu); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (ret) goto out_err; - raw_spin_lock_init(&trie->lock); + spin_lock_init(&trie->lock); return &trie->map; out_err: diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 6e090140b924..b367430e611c 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } +static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + node->next = head->first; + head->first = node; +} + static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); - node->next = head->first; - head->first = node; + pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } @@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; - unsigned long flags; int i, cpu, pcpu_entries; pcpu_entries = nr_elems / num_possible_cpus() + 1; i = 0; - /* disable irq to workaround lockdep false positive - * in bpf usage pcpu_freelist_populate() will never race - * with pcpu_freelist_push() - */ - local_irq_save(flags); for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - ___pcpu_freelist_push(head, buf); + /* No locking required as this is not visible yet. */ + pcpu_freelist_push_node(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -78,7 +79,6 @@ again: if (i % pcpu_entries) goto again; } - local_irq_restore(flags); } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 50c083ba978c..01badd3eda7a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -305,11 +305,6 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - /* Ensure reuse->reuseport_id is set */ - err = reuseport_get_id(reuse); - if (err < 0) - goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3f958b90d914..db76339fe358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry) { struct stack_map_irq_work *work; + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + work = container_of(entry, struct stack_map_irq_work, irq_work); up_read_non_owner(work->sem); work->sem = NULL; @@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, struct stack_map_irq_work *work = NULL; if (irqs_disabled()) { - work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) - /* cannot queue more up_read, fallback */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&up_read_work); + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. + */ irq_work_busy = true; + } } /* diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..d85f37239540 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -171,11 +172,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); @@ -206,8 +203,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -222,8 +218,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, if (bpf_map_is_dev_bound(map)) return bpf_map_offload_lookup_elem(map, key, value); - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); @@ -268,8 +263,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, rcu_read_unlock(); } - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -592,9 +586,7 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt++; mutex_unlock(&map->freeze_mutex); @@ -606,13 +598,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt--; mutex_unlock(&map->freeze_mutex); } - - bpf_map_put_with_uref(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -641,14 +631,16 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; + vma->vm_flags &= ~VM_MAYEXEC; + if (!(vma->vm_flags & VM_WRITE)) + /* disallow re-mapping with PROT_WRITE */ + vma->vm_flags &= ~VM_MAYWRITE; err = map->ops->map_mmap(map, vma); if (err) goto out; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) + if (vma->vm_flags & VM_MAYWRITE) map->writecnt++; out: mutex_unlock(&map->freeze_mutex); @@ -696,14 +688,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +705,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +803,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -909,6 +903,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -1136,13 +1145,11 @@ static int map_delete_elem(union bpf_attr *attr) goto out; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: kfree(key); @@ -1254,13 +1261,11 @@ int generic_map_delete_batch(struct bpf_map *map, break; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); if (err) break; @@ -1510,6 +1515,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { @@ -1931,6 +1941,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; @@ -2093,8 +2104,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ @@ -2169,84 +2181,288 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog) { - struct bpf_prog *prog = filp->private_data; + atomic64_set(&link->refcnt, 1); + link->ops = ops; + link->prog = prog; +} - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - bpf_prog_put(prog); +/* Clean up bpf_link and corresponding anon_inode file and FD. After + * anon_inode is created, bpf_link can't be just kfree()'d due to deferred + * anon_inode's release() call. This helper manages marking bpf_link as + * defunct, releases anon_inode file and puts reserved FD. + */ +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) +{ + link->prog = NULL; + fput(link_file); + put_unused_fd(link_fd); +} + +void bpf_link_inc(struct bpf_link *link) +{ + atomic64_inc(&link->refcnt); +} + +/* bpf_link_free is guaranteed to be called from process context */ +static void bpf_link_free(struct bpf_link *link) +{ + if (link->prog) { + /* detach BPF program, clean up used resources */ + link->ops->release(link); + bpf_prog_put(link->prog); + } + /* free bpf_link and its containing memory */ + link->ops->dealloc(link); +} + +static void bpf_link_put_deferred(struct work_struct *work) +{ + struct bpf_link *link = container_of(work, struct bpf_link, work); + + bpf_link_free(link); +} + +/* bpf_link_put can be called from atomic context, but ensures that resources + * are freed from process context + */ +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } +} + +static int bpf_link_release(struct inode *inode, struct file *filp) +{ + struct bpf_link *link = filp->private_data; + + bpf_link_put(link); return 0; } -static const struct file_operations bpf_tracing_prog_fops = { - .release = bpf_tracing_prog_release, +#ifdef CONFIG_PROC_FS +static const struct bpf_link_ops bpf_raw_tp_lops; +static const struct bpf_link_ops bpf_tracing_link_lops; + +static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_link *link = filp->private_data; + const struct bpf_prog *prog = link->prog; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + const char *link_type; + + if (link->ops == &bpf_raw_tp_lops) + link_type = "raw_tracepoint"; + else if (link->ops == &bpf_tracing_link_lops) + link_type = "tracing"; +#ifdef CONFIG_CGROUP_BPF + else if (link->ops == &bpf_cgroup_link_lops) + link_type = "cgroup"; +#endif + else + link_type = "unknown"; + + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "link_type:\t%s\n" + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + link_type, + prog_tag, + prog->aux->id); +} +#endif + +const struct file_operations bpf_link_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +} + +/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but + * instead of immediately installing fd in fdtable, just reserve it and + * return. Caller then need to either install it with fd_install(fd, file) or + * release with put_unused_fd(fd). + * This is useful for cases when bpf_link attachment/detachment are + * complicated and expensive operations and should be delayed until all the fd + * reservation and anon_inode creation succeeds. + */ +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +{ + struct file *file; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return ERR_PTR(fd); + + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(fd); + return file; + } + + *reserved_fd = fd; + return file; +} + +struct bpf_link *bpf_link_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_link *link; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + link = f.file->private_data; + bpf_link_inc(link); + fdput(f); + + return link; +} + +struct bpf_tracing_link { + struct bpf_link link; +}; + +static void bpf_tracing_link_release(struct bpf_link *link) +{ + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); +} + +static void bpf_tracing_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + kfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_link_lops = { + .release = bpf_tracing_link_release, + .dealloc = bpf_tracing_link_dealloc, +}; + static int bpf_tracing_prog_attach(struct bpf_prog *prog) { - int tr_fd, err; + struct bpf_tracing_link *link; + struct file *link_file; + int link_fd, err; - if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT && - prog->type != BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_EXT: + if (prog->expected_attach_type != 0) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type != BPF_LSM_MAC) { + err = -EINVAL; + goto out_put_prog; + } + break; + default: err = -EINVAL; goto out_put_prog; } - err = bpf_trampoline_link_prog(prog); - if (err) + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; goto out_put_prog; + } + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, - prog, O_CLOEXEC); - if (tr_fd < 0) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = tr_fd; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_prog; + } + + err = bpf_trampoline_link_prog(prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_prog; } - return tr_fd; + + fd_install(link_fd, link_file); + return link_fd; out_put_prog: bpf_prog_put(prog); return err; } -struct bpf_raw_tracepoint { +struct bpf_raw_tp_link { + struct bpf_link link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; }; -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +static void bpf_raw_tp_link_release(struct bpf_link *link) { - struct bpf_raw_tracepoint *raw_tp = filp->private_data; + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); +} + +static void bpf_raw_tp_link_dealloc(struct bpf_link *link) +{ + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + kfree(raw_tp); - return 0; } -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, +static const struct bpf_link_ops bpf_raw_tp_lops = { + .release = bpf_raw_tp_link_release, + .dealloc = bpf_raw_tp_link_dealloc, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; + struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int tp_fd, err; + int link_fd, err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2255,16 +2471,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_EXT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } - - if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. @@ -2272,11 +2482,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -EINVAL; goto out_put_prog; } - if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; - else - return bpf_tracing_prog_attach(prog); - } else { + break; + } + return bpf_tracing_prog_attach(prog); + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), sizeof(buf) - 1) < 0) { @@ -2285,6 +2498,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } buf[sizeof(buf) - 1] = 0; tp_name = buf; + break; + default: + err = -EINVAL; + goto out_put_prog; } btp = bpf_get_raw_tracepoint(tp_name); @@ -2293,29 +2510,30 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) { + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { err = -ENOMEM; goto out_put_btp; } - raw_tp->btp = btp; - raw_tp->prog = prog; + bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + link->btp = btp; - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_free_tp; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_btp; + } - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; - goto out_free_tp; + err = bpf_probe_register(link->btp, prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_btp; } - return tp_fd; -out_free_tp: - kfree(raw_tp); + fd_install(link_fd, link_file); + return link_fd; + out_put_btp: bpf_put_raw_tracepoint(btp); out_put_prog: @@ -2340,36 +2558,18 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd - -#define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) - -static int bpf_prog_attach(const union bpf_attr *attr) +static enum bpf_prog_type +attach_type_to_prog_type(enum bpf_attach_type attach_type) { - enum bpf_prog_type ptype; - struct bpf_prog *prog; - int ret; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (CHECK_ATTR(BPF_PROG_ATTACH)) - return -EINVAL; - - if (attr->attach_flags & ~BPF_F_ATTACH_MASK) - return -EINVAL; - - switch (attr->attach_type) { + switch (attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: - ptype = BPF_PROG_TYPE_CGROUP_SKB; + return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: - ptype = BPF_PROG_TYPE_CGROUP_SOCK; - break; + return BPF_PROG_TYPE_CGROUP_SOCK; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: @@ -2378,37 +2578,53 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; + return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: - ptype = BPF_PROG_TYPE_SOCK_OPS; - break; + return BPF_PROG_TYPE_SOCK_OPS; case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; + return BPF_PROG_TYPE_CGROUP_DEVICE; case BPF_SK_MSG_VERDICT: - ptype = BPF_PROG_TYPE_SK_MSG; - break; + return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: - ptype = BPF_PROG_TYPE_LIRC_MODE2; - break; + return BPF_PROG_TYPE_LIRC_MODE2; case BPF_FLOW_DISSECTOR: - ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; - break; + return BPF_PROG_TYPE_FLOW_DISSECTOR; case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; + return BPF_PROG_TYPE_CGROUP_SYSCTL; case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + return BPF_PROG_TYPE_CGROUP_SOCKOPT; default: - return -EINVAL; + return BPF_PROG_TYPE_UNSPEC; } +} + +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd + +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -2430,8 +2646,17 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = skb_flow_dissector_bpf_prog_attach(attr, prog); break; - default: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: ret = cgroup_bpf_prog_attach(attr, ptype, prog); + break; + default: + ret = -EINVAL; } if (ret) @@ -2451,53 +2676,27 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; - switch (attr->attach_type) { - case BPF_CGROUP_INET_INGRESS: - case BPF_CGROUP_INET_EGRESS: - ptype = BPF_PROG_TYPE_CGROUP_SKB; - break; - case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: - ptype = BPF_PROG_TYPE_CGROUP_SOCK; - break; - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; - case BPF_CGROUP_SOCK_OPS: - ptype = BPF_PROG_TYPE_SOCK_OPS; - break; - case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; - case BPF_SK_MSG_VERDICT: - return sock_map_get_from_fd(attr, NULL); - case BPF_SK_SKB_STREAM_PARSER: - case BPF_SK_SKB_STREAM_VERDICT: + ptype = attach_type_to_prog_type(attr->attach_type); + + switch (ptype) { + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: return sock_map_get_from_fd(attr, NULL); - case BPF_LIRC_MODE2: + case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); - case BPF_FLOW_DISSECTOR: + case BPF_PROG_TYPE_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); - case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; - case BPF_CGROUP_GETSOCKOPT: - case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; } - - return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -2531,7 +2730,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: - break; + return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -2539,8 +2738,6 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - - return cgroup_bpf_prog_query(attr, uattr); } #define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out @@ -2787,7 +2984,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2799,6 +2996,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3062,7 +3260,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3071,6 +3269,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -3262,15 +3461,21 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (err) goto out; - if (file->f_op == &bpf_raw_tp_fops) { - struct bpf_raw_tracepoint *raw_tp = file->private_data; - struct bpf_raw_event_map *btp = raw_tp->btp; + if (file->f_op == &bpf_link_fops) { + struct bpf_link *link = file->private_data; - err = bpf_task_fd_query_copy(attr, uattr, - raw_tp->prog->aux->id, - BPF_FD_TYPE_RAW_TRACEPOINT, - btp->tp->name, 0, 0); - goto put_file; + if (link->ops == &bpf_raw_tp_lops) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->link.prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + goto out_not_supp; } event = perf_get_event(file); @@ -3290,6 +3495,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, goto put_file; } +out_not_supp: err = -ENOTSUPP; put_file: fput(file); @@ -3352,9 +3558,107 @@ err_put: return err; } +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +static int link_create(union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_CREATE)) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_attach_check_attach_type(prog, + attr->link_create.attach_type); + if (ret) + goto err_out; + + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + ret = cgroup_bpf_link_attach(attr, prog); + break; + default: + ret = -EINVAL; + } + +err_out: + if (ret < 0) + bpf_prog_put(prog); + return ret; +} + +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd + +static int link_update(union bpf_attr *attr) +{ + struct bpf_prog *old_prog = NULL, *new_prog; + struct bpf_link *link; + u32 flags; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_UPDATE)) + return -EINVAL; + + flags = attr->link_update.flags; + if (flags & ~BPF_F_REPLACE) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_update.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + + if (flags & BPF_F_REPLACE) { + old_prog = bpf_prog_get(attr->link_update.old_prog_fd); + if (IS_ERR(old_prog)) { + ret = PTR_ERR(old_prog); + old_prog = NULL; + goto out_put_progs; + } + } + +#ifdef CONFIG_CGROUP_BPF + if (link->ops == &bpf_cgroup_link_lops) { + ret = cgroup_bpf_replace(link, old_prog, new_prog); + goto out_put_progs; + } +#endif + ret = -EINVAL; + +out_put_progs: + if (old_prog) + bpf_prog_put(old_prog); + if (ret) + bpf_prog_put(new_prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3366,6 +3670,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; @@ -3462,6 +3767,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); break; + case BPF_LINK_CREATE: + err = link_create(&attr); + break; + case BPF_LINK_UPDATE: + err = link_update(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 7ae5dddd1fe6..3b495773de5a 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,15 +9,15 @@ #include <linux/sysfs.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_vmlinux_bin_start[]; -extern char __weak _binary__btf_vmlinux_bin_end[]; +extern char __weak __start_BTF[]; +extern char __weak __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); + memcpy(buf, __start_BTF + off, len); return len; } @@ -30,15 +30,14 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!_binary__btf_vmlinux_bin_start) + if (!__start_BTF) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - - _binary__btf_vmlinux_bin_start; + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d4f335a9a899..ceac5281bd31 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -194,3 +194,18 @@ int tnum_sbin(char *str, size_t size, struct tnum a) str[min(size - 1, (size_t)64)] = 0; return 64; } + +struct tnum tnum_subreg(struct tnum a) +{ + return tnum_cast(a, 4); +} + +struct tnum tnum_clear_subreg(struct tnum a) +{ + return tnum_lshift(tnum_rshift(a, 32), 32); +} + +struct tnum tnum_const_subreg(struct tnum a, u32 value) +{ + return tnum_or(tnum_clear_subreg(a), tnum_const(value)); +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6b264a92064b..9be85aa4ec5f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,8 @@ #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> +#include <linux/perf_event.h> +#include <linux/btf.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -17,12 +19,11 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; -static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table and image_tree */ +/* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -static void *bpf_jit_alloc_exec_page(void) +void *bpf_jit_alloc_exec_page(void) { void *image; @@ -38,62 +39,28 @@ static void *bpf_jit_alloc_exec_page(void) return image; } -static __always_inline bool image_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { - struct bpf_image *ia = container_of(a, struct bpf_image, tnode); - struct bpf_image *ib = container_of(b, struct bpf_image, tnode); - - return ia < ib; -} - -static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) -{ - void *image = container_of(n, struct bpf_image, tnode); - - if (addr < image) - return -1; - if (addr >= image + PAGE_SIZE) - return 1; - - return 0; -} - -static const struct latch_tree_ops image_tree_ops = { - .less = image_tree_less, - .comp = image_tree_comp, -}; - -static void *__bpf_image_alloc(bool lock) -{ - struct bpf_image *image; - - image = bpf_jit_alloc_exec_page(); - if (!image) - return NULL; - - if (lock) - mutex_lock(&trampoline_mutex); - latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); - if (lock) - mutex_unlock(&trampoline_mutex); - return image->data; + ksym->start = (unsigned long) data; + ksym->end = ksym->start + PAGE_SIZE; + bpf_ksym_add(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, false, ksym->name); } -void *bpf_image_alloc(void) +void bpf_image_ksym_del(struct bpf_ksym *ksym) { - return __bpf_image_alloc(true); + bpf_ksym_del(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, true, ksym->name); } -bool is_bpf_image_address(unsigned long addr) +static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) { - bool ret; - - rcu_read_lock(); - ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; - rcu_read_unlock(); + struct bpf_ksym *ksym = &tr->ksym; - return ret; + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); + bpf_image_ksym_add(tr->image, ksym); } struct bpf_trampoline *bpf_trampoline_lookup(u64 key) @@ -116,7 +83,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = __bpf_image_alloc(false); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -131,6 +98,8 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); tr->image = image; + INIT_LIST_HEAD_RCU(&tr->ksym.lnode); + bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -190,40 +159,50 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 - */ -#define BPF_MAX_TRAMP_PROGS 40 +static struct bpf_tramp_progs * +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +{ + const struct bpf_prog_aux *aux; + struct bpf_tramp_progs *tprogs; + struct bpf_prog **progs; + int kind; + + *total = 0; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return ERR_PTR(-ENOMEM); + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + tprogs[kind].nr_progs = tr->progs_cnt[kind]; + *total += tr->progs_cnt[kind]; + progs = tprogs[kind].progs; + + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + *progs++ = aux->prog; + } + return tprogs; +} static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; - struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; - int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; - int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; - struct bpf_prog **progs, **fentry, **fexit; + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; - struct bpf_prog_aux *aux; - int err; + int err, total; + + tprogs = bpf_trampoline_get_progs(tr, &total); + if (IS_ERR(tprogs)) + return PTR_ERR(tprogs); - if (fentry_cnt + fexit_cnt == 0) { + if (total == 0) { err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } - /* populate fentry progs */ - fentry = progs = progs_to_run; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) - *progs++ = aux->prog; - - /* populate fexit progs */ - fexit = progs; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) - *progs++ = aux->prog; - - if (fexit_cnt) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -232,12 +211,11 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. */ + synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, - &tr->func.model, flags, - fentry, fentry_cnt, - fexit, fexit_cnt, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; @@ -252,16 +230,27 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; tr->selector++; out: + kfree(tprogs); return err; } -static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { - switch (t) { + switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_LSM_MAC: + if (!prog->aux->attach_func_proto->type) + /* The function returns void, we cannot modify its + * return value. + */ + return BPF_TRAMP_FEXIT; + else + return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } @@ -275,7 +264,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) int cnt; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -325,7 +314,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) int err; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -344,8 +333,6 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { - struct bpf_image *image; - if (!tr) return; mutex_lock(&trampoline_mutex); @@ -356,35 +343,37 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - image = container_of(tr->image, struct bpf_image, data); - latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); + bpf_image_ksym_del(&tr->ksym); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(image); + bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: mutex_unlock(&trampoline_mutex); } -/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that - * are needed for trampoline. The macro is split into +/* The logic is similar to BPF_PROG_RUN, but with an explicit + * rcu_read_lock() and migrate_disable() which are required + * for the trampoline. The macro is split into * call _bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit */ u64 notrace __bpf_prog_enter(void) + __acquires(RCU) { u64 start = 0; rcu_read_lock(); - preempt_disable(); + migrate_disable(); if (static_branch_unlikely(&bpf_stats_enabled_key)) start = sched_clock(); return start; } void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) { struct bpf_prog_stats *stats; @@ -401,15 +390,14 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } - preempt_enable(); + migrate_enable(); rcu_read_unlock(); } int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cc945daa9c8..38cfcf701eeb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,8 @@ #include <linux/sort.h> #include <linux/perf_event.h> #include <linux/ctype.h> +#include <linux/error-injection.h> +#include <linux/bpf_lsm.h> #include "disasm.h" @@ -227,8 +229,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; - s64 msize_smax_value; - u64 msize_umax_value; + u64 msize_max_value; int ref_obj_id; int func_id; u32 btf_id; @@ -549,6 +550,22 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, ",var_off=%s", tn_buf); } + if (reg->s32_min_value != reg->smin_value && + reg->s32_min_value != S32_MIN) + verbose(env, ",s32_min_value=%d", + (int)(reg->s32_min_value)); + if (reg->s32_max_value != reg->smax_value && + reg->s32_max_value != S32_MAX) + verbose(env, ",s32_max_value=%d", + (int)(reg->s32_max_value)); + if (reg->u32_min_value != reg->umin_value && + reg->u32_min_value != U32_MIN) + verbose(env, ",u32_min_value=%d", + (int)(reg->u32_min_value)); + if (reg->u32_max_value != reg->umax_value && + reg->u32_max_value != U32_MAX) + verbose(env, ",u32_max_value=%d", + (int)(reg->u32_max_value)); } verbose(env, ")"); } @@ -923,6 +940,20 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->smax_value = (s64)imm; reg->umin_value = imm; reg->umax_value = imm; + + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; +} + +static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) +{ + reg->var_off = tnum_const_subreg(reg->var_off, imm); + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; } /* Mark the 'variable offset' part of a register as zero. This should be @@ -977,8 +1008,52 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } -/* Attempts to improve min/max values based on var_off information */ -static void __update_reg_bounds(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; + + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __mark_reg64_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; +} + +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +{ + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + struct tnum var32_off = tnum_subreg(reg->var_off); + + /* min signed is max(sign bit) | min(other bits) */ + reg->s32_min_value = max_t(s32, reg->s32_min_value, + var32_off.value | (var32_off.mask & S32_MIN)); + /* max signed is min(sign bit) | max(other bits) */ + reg->s32_max_value = min_t(s32, reg->s32_max_value, + var32_off.value | (var32_off.mask & S32_MAX)); + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); + reg->u32_max_value = min(reg->u32_max_value, + (u32)(var32_off.value | var32_off.mask)); +} + +static void __update_reg64_bounds(struct bpf_reg_state *reg) { /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, @@ -991,8 +1066,48 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) reg->var_off.value | reg->var_off.mask); } +static void __update_reg_bounds(struct bpf_reg_state *reg) +{ + __update_reg32_bounds(reg); + __update_reg64_bounds(reg); +} + /* Uses signed min/max values to inform unsigned, and vice-versa */ -static void __reg_deduce_bounds(struct bpf_reg_state *reg) +static void __reg32_deduce_bounds(struct bpf_reg_state *reg) +{ + /* Learn sign from signed bounds. + * If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + */ + if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + return; + } + /* Learn sign from unsigned bounds. Signed bounds cross the sign + * boundary, so we must be careful. + */ + if ((s32)reg->u32_max_value >= 0) { + /* Positive. We can't learn anything from the smin, but smax + * is positive, hence safe. + */ + reg->s32_min_value = reg->u32_min_value; + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + } else if ((s32)reg->u32_min_value < 0) { + /* Negative. We can't learn anything from the smax, but smin + * is negative, hence safe. + */ + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value; + } +} + +static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { /* Learn sign from signed bounds. * If we cannot cross the sign boundary, then signed and unsigned bounds @@ -1026,32 +1141,106 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) } } +static void __reg_deduce_bounds(struct bpf_reg_state *reg) +{ + __reg32_deduce_bounds(reg); + __reg64_deduce_bounds(reg); +} + /* Attempts to improve var_off based on unsigned min/max information */ static void __reg_bound_offset(struct bpf_reg_state *reg) { - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + struct tnum var64_off = tnum_intersect(reg->var_off, + tnum_range(reg->umin_value, + reg->umax_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); + + reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); +} + +static void __reg_assign_32_into_64(struct bpf_reg_state *reg) +{ + reg->umin_value = reg->u32_min_value; + reg->umax_value = reg->u32_max_value; + /* Attempt to pull 32-bit signed bounds into 64-bit bounds + * but must be positive otherwise set to worse case bounds + * and refine later from tnum. + */ + if (reg->s32_min_value > 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; + if (reg->s32_max_value > 0) + reg->smax_value = reg->s32_max_value; + else + reg->smax_value = U32_MAX; } -static void __reg_bound_offset32(struct bpf_reg_state *reg) +static void __reg_combine_32_into_64(struct bpf_reg_state *reg) { - u64 mask = 0xffffFFFF; - struct tnum range = tnum_range(reg->umin_value & mask, - reg->umax_value & mask); - struct tnum lo32 = tnum_cast(reg->var_off, 4); - struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + /* special case when 64-bit register has upper 32-bit register + * zeroed. Typically happens after zext or <<32, >>32 sequence + * allowing us to use 32-bit bounds directly, + */ + if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { + __reg_assign_32_into_64(reg); + } else { + /* Otherwise the best we can do is push lower 32bit known and + * unknown bits into register (var_off set from jmp logic) + * then learn as much as possible from the 64-bit tnum + * known and unknown bits. The previous smin/smax bounds are + * invalid here because of jmp32 compare so mark them unknown + * so they do not impact tnum bounds calculation. + */ + __mark_reg64_unbounded(reg); + __update_reg_bounds(reg); + } - reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static bool __reg64_bound_s32(s64 a) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + if (a > S32_MIN && a < S32_MAX) + return true; + return false; +} + +static bool __reg64_bound_u32(u64 a) +{ + if (a > U32_MIN && a < U32_MAX) + return true; + return false; +} + +static void __reg_combine_64_into_32(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + + if (__reg64_bound_s32(reg->smin_value)) + reg->s32_min_value = (s32)reg->smin_value; + if (__reg64_bound_s32(reg->smax_value)) + reg->s32_max_value = (s32)reg->smax_value; + if (__reg64_bound_u32(reg->umin_value)) + reg->u32_min_value = (u32)reg->umin_value; + if (__reg64_bound_u32(reg->umax_value)) + reg->u32_max_value = (u32)reg->umax_value; + + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1066,8 +1255,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; __mark_reg_unbounded(reg); } @@ -2784,6 +2972,12 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, return 0; } +/* BPF architecture zero extends alu32 ops into 64-bit registesr */ +static void zext_32_to_64(struct bpf_reg_state *reg) +{ + reg->var_off = tnum_subreg(reg->var_off); + __reg_assign_32_into_64(reg); +} /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE @@ -2806,6 +3000,14 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) } reg->smin_value = reg->umin_value; reg->smax_value = reg->umax_value; + + /* If size is smaller than 32bit register the 32bit register + * values are also truncated so we push 64-bit bounds into + * 32-bit bounds. Above were truncated < 32-bits already. + */ + if (size >= 4) + return; + __reg_combine_64_into_32(reg); } static bool bpf_map_is_rdonly(const struct bpf_map *map) @@ -3460,13 +3662,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX) { + } else if (arg_type == ARG_PTR_TO_CTX || + arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_CTX_OR_NULL)) { + if (type != expected_type) + goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ @@ -3576,11 +3782,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* remember the mem_size which may be used later - * to refine return values. + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. */ - meta->msize_smax_value = reg->smax_value; - meta->msize_umax_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -3649,7 +3859,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3693,14 +3904,16 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_map) + func_id != BPF_FUNC_msg_redirect_map && + func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash && func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_hash) + func_id != BPF_FUNC_msg_redirect_hash && + func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -3737,6 +3950,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -3774,7 +3988,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_select_reuseport: - if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && + map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_map_peek_elem: @@ -4117,10 +4333,11 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, func_id != BPF_FUNC_probe_read_str)) return; - ret_reg->smax_value = meta->msize_smax_value; - ret_reg->umax_value = meta->msize_umax_value; + ret_reg->smax_value = meta->msize_max_value; + ret_reg->s32_max_value = meta->msize_max_value; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); + __update_reg_bounds(ret_reg); } static int @@ -4427,7 +4644,17 @@ static bool signed_add_overflows(s64 a, s64 b) return res < a; } -static bool signed_sub_overflows(s64 a, s64 b) +static bool signed_add32_overflows(s64 a, s64 b) +{ + /* Do the add in u32, where overflow is well-defined */ + s32 res = (s32)((u32)a + (u32)b); + + if (b < 0) + return res > a; + return res < a; +} + +static bool signed_sub_overflows(s32 a, s32 b) { /* Do the sub in u64, where overflow is well-defined */ s64 res = (s64)((u64)a - (u64)b); @@ -4437,6 +4664,16 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool signed_sub32_overflows(s32 a, s32 b) +{ + /* Do the sub in u64, where overflow is well-defined */ + s32 res = (s32)((u32)a - (u32)b); + + if (b < 0) + return res < a; + return res > a; +} + static bool check_reg_sane_offset(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) @@ -4673,6 +4910,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) return -EINVAL; + /* pointer types do not carry 32-bit bounds at the moment. */ + __mark_reg32_unbounded(dst_reg); + switch (opcode) { case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); @@ -4836,6 +5076,518 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return 0; } +static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) || + signed_add32_overflows(dst_reg->s32_max_value, smax_val)) { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value += smin_val; + dst_reg->s32_max_value += smax_val; + } + if (dst_reg->u32_min_value + umin_val < umin_val || + dst_reg->u32_max_value + umax_val < umax_val) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value += umin_val; + dst_reg->u32_max_value += umax_val; + } +} + +static void scalar_min_max_add(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + s64 smax_val = src_reg->smax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (signed_add_overflows(dst_reg->smin_value, smin_val) || + signed_add_overflows(dst_reg->smax_value, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value += smin_val; + dst_reg->smax_value += smax_val; + } + if (dst_reg->umin_value + umin_val < umin_val || + dst_reg->umax_value + umax_val < umax_val) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value += umin_val; + dst_reg->umax_value += umax_val; + } +} + +static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) || + signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value -= smax_val; + dst_reg->s32_max_value -= smin_val; + } + if (dst_reg->u32_min_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->u32_min_value -= umax_val; + dst_reg->u32_max_value -= umin_val; + } +} + +static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + s64 smax_val = src_reg->smax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (signed_sub_overflows(dst_reg->smin_value, smax_val) || + signed_sub_overflows(dst_reg->smax_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value -= smax_val; + dst_reg->smax_value -= smin_val; + } + if (dst_reg->umin_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value -= umax_val; + dst_reg->umax_value -= umin_val; + } +} + +static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (smin_val < 0 || dst_reg->s32_min_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg32_unbounded(dst_reg); + return; + } + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S32_MAX). + */ + if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg32_unbounded(dst_reg); + return; + } + dst_reg->u32_min_value *= umin_val; + dst_reg->u32_max_value *= umax_val; + if (dst_reg->u32_max_value > S32_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } +} + +static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (smin_val < 0 || dst_reg->smin_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg64_unbounded(dst_reg); + return; + } + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S64_MAX). + */ + if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg64_unbounded(dst_reg); + return; + } + dst_reg->umin_value *= umin_val; + dst_reg->umax_value *= umax_val; + if (dst_reg->umax_value > S64_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } +} + +static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + u32 umax_val = src_reg->u32_max_value; + + /* Assuming scalar64_min_max_and will be called so its safe + * to skip updating register for known 32-bit case. + */ + if (src_known && dst_known) + return; + + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. + */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } + +} + +static void scalar_min_max_and(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + u64 umax_val = src_reg->umax_value; + + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value & + src_reg->var_off.value); + return; + } + + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. + */ + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->smin_value; + u32 umin_val = src_reg->umin_value; + + /* Assuming scalar64_min_max_or will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima + */ + dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->s32_min_value = dst_reg->umin_value; + dst_reg->s32_max_value = dst_reg->umax_value; + } +} + +static void scalar_min_max_or(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + u64 umin_val = src_reg->umin_value; + + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value | + src_reg->var_off.value); + return; + } + + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima + */ + dst_reg->umin_value = max(dst_reg->umin_value, umin_val); + dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) +{ + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + /* If we might shift our top bit out, then we know nothing */ + if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value <<= umin_val; + dst_reg->u32_max_value <<= umax_val; + } +} + +static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + /* u32 alu operation will zext upper bits */ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val)); + /* Not required but being careful mark reg64 bounds as unknown so + * that we are forced to pick them up from tnum and zext later and + * if some path skips this step we are still safe. + */ + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) +{ + /* Special case <<32 because it is a common compiler pattern to sign + * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are + * positive we know this shift will also be positive so we can track + * bounds correctly. Otherwise we lose all sign bit information except + * what we can pick up from var_off. Perhaps we can generalize this + * later to shifts of any length. + */ + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; + else + dst_reg->smax_value = S64_MAX; + + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) + dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; + else + dst_reg->smin_value = S64_MIN; + + /* If we might shift our top bit out, then we know nothing */ + if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value <<= umin_val; + dst_reg->umax_value <<= umax_val; + } +} + +static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umax_val = src_reg->umax_value; + u64 umin_val = src_reg->umin_value; + + /* scalar64 calc uses 32bit unshifted bounds so must be called first */ + __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + + dst_reg->var_off = tnum_rshift(subreg, umin_val); + dst_reg->u32_min_value >>= umax_val; + dst_reg->u32_max_value >>= umin_val; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umax_val = src_reg->umax_value; + u64 umin_val = src_reg->umin_value; + + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); + dst_reg->umin_value >>= umax_val; + dst_reg->umax_value >>= umin_val; + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in. Take easy way out and mark unbounded + * so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umin_val = src_reg->u32_min_value; + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. + */ + dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); + dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + + dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umin_val = src_reg->umin_value; + + /* Upon reaching here, src_known is true and umax_val is equal + * to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in from upper 32-bits. Take easy way out + * and mark unbounded so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); + __update_reg_bounds(dst_reg); +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -4850,33 +5602,47 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + s32 s32_min_val, s32_max_val; + u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; u32 dst = insn->dst_reg; int ret; - - if (insn_bitness == 32) { - /* Relevant for 32-bit RSH: Information can propagate towards - * LSB, so it isn't sufficient to only truncate the output to - * 32 bits. - */ - coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); - } + bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; umax_val = src_reg.umax_value; - src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); - if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds derived from - * e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; + s32_min_val = src_reg.s32_min_value; + s32_max_val = src_reg.s32_max_value; + u32_min_val = src_reg.u32_min_value; + u32_max_val = src_reg.u32_max_value; + + if (alu32) { + src_known = tnum_subreg_is_const(src_reg.var_off); + dst_known = tnum_subreg_is_const(dst_reg->var_off); + if ((src_known && + (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || + s32_min_val > s32_max_val || u32_min_val > u32_max_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. + */ + __mark_reg_unknown(env, dst_reg); + return 0; + } + } else { + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && + (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. + */ + __mark_reg_unknown(env, dst_reg); + return 0; + } } if (!src_known && @@ -4885,6 +5651,20 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. + * There are two classes of instructions: The first class we track both + * alu32 and alu64 sign/unsigned bounds independently this provides the + * greatest amount of precision when alu operations are mixed with jmp32 + * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD, + * and BPF_OR. This is possible because these ops have fairly easy to + * understand and calculate behavior in both 32-bit and 64-bit alu ops. + * See alu32 verifier tests for examples. The second class of + * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy + * with regards to tracking sign/unsigned bounds because the bits may + * cross subreg boundaries in the alu64 case. When this happens we mark + * the reg unbounded in the subreg bound space and use the resulting + * tnum to calculate an approximation of the sign/unsigned bounds. + */ switch (opcode) { case BPF_ADD: ret = sanitize_val_alu(env, insn); @@ -4892,22 +5672,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to add from different pointers or scalars\n", dst); return ret; } - if (signed_add_overflows(dst_reg->smin_value, smin_val) || - signed_add_overflows(dst_reg->smax_value, smax_val)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value += smin_val; - dst_reg->smax_value += smax_val; - } - if (dst_reg->umin_value + umin_val < umin_val || - dst_reg->umax_value + umax_val < umax_val) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value += umin_val; - dst_reg->umax_value += umax_val; - } + scalar32_min_max_add(dst_reg, &src_reg); + scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: @@ -4916,111 +5682,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); return ret; } - if (signed_sub_overflows(dst_reg->smin_value, smax_val) || - signed_sub_overflows(dst_reg->smax_value, smin_val)) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value -= smax_val; - dst_reg->smax_value -= smin_val; - } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; - } + scalar32_min_max_sub(dst_reg, &src_reg); + scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); - if (smin_val < 0 || dst_reg->smin_value < 0) { - /* Ain't nobody got time to multiply that sign */ - __mark_reg_unbounded(dst_reg); - __update_reg_bounds(dst_reg); - break; - } - /* Both values are positive, so we can work with unsigned and - * copy the result to signed (unless it exceeds S64_MAX). - */ - if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { - /* Potential overflow, we know nothing */ - __mark_reg_unbounded(dst_reg); - /* (except what we can learn from the var_off) */ - __update_reg_bounds(dst_reg); - break; - } - dst_reg->umin_value *= umin_val; - dst_reg->umax_value *= umax_val; - if (dst_reg->umax_value > S64_MAX) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } + scalar32_min_max_mul(dst_reg, &src_reg); + scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg.var_off.value); - break; - } - /* We get our minimum from the var_off, since that's inherently - * bitwise. Our maximum is the minimum of the operands' maxima. - */ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + scalar32_min_max_and(dst_reg, &src_reg); + scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg.var_off.value); - break; - } - /* We get our maximum from the var_off, and our minimum is the - * maximum of the operands' minima - */ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | - dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + scalar32_min_max_or(dst_reg, &src_reg); + scalar_min_max_or(dst_reg, &src_reg); break; case BPF_LSH: if (umax_val >= insn_bitness) { @@ -5030,22 +5709,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_lsh(dst_reg, &src_reg); + else + scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: if (umax_val >= insn_bitness) { @@ -5055,27 +5722,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift. If the value in dst_reg might - * be negative, then either: - * 1) src_reg might be zero, so the sign bit of the result is - * unknown, so we lose our signed bounds - * 2) it's known negative, thus the unsigned bounds capture the - * signed bounds - * 3) the signed bounds cross zero, so they tell us nothing - * about the result - * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. - * Thus, in all cases it suffices to blow away our signed bounds - * and rely on inferring new ones from the unsigned bounds and - * var_off of the result. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_rsh(dst_reg, &src_reg); + else + scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: if (umax_val >= insn_bitness) { @@ -5085,38 +5735,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - - /* Upon reaching here, src_known is true and - * umax_val is equal to umin_val. - */ - if (insn_bitness == 32) { - dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); - dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); - } else { - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - } - - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, - insn_bitness); - - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_arsh(dst_reg, &src_reg); + else + scalar_min_max_arsh(dst_reg, &src_reg); break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; } - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->32 */ - coerce_reg_to_size(dst_reg, 4); - } + /* ALU32 ops are zero extended into 64bit register */ + if (alu32) + zext_32_to_64(dst_reg); + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; @@ -5290,7 +5923,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_unknown(env, regs, insn->dst_reg); } - coerce_reg_to_size(dst_reg, 4); + zext_32_to_64(dst_reg); } } else { /* case: R = imm @@ -5460,55 +6093,83 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range); } -/* compute branch direction of the expression "if (reg opcode val) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, - bool is_jmp32) +static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) { - struct bpf_reg_state reg_lo; - s64 sval; + struct tnum subreg = tnum_subreg(reg->var_off); + s32 sval = (s32)val; - if (__is_pointer_value(false, reg)) - return -1; + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(subreg)) + return !!tnum_equals_const(subreg, val); + break; + case BPF_JNE: + if (tnum_is_const(subreg)) + return !tnum_equals_const(subreg, val); + break; + case BPF_JSET: + if ((~subreg.mask & subreg.value) & val) + return 1; + if (!((subreg.mask | subreg.value) & val)) + return 0; + break; + case BPF_JGT: + if (reg->u32_min_value > val) + return 1; + else if (reg->u32_max_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->s32_min_value > sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLT: + if (reg->u32_max_value < val) + return 1; + else if (reg->u32_min_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->s32_max_value < sval) + return 1; + else if (reg->s32_min_value >= sval) + return 0; + break; + case BPF_JGE: + if (reg->u32_min_value >= val) + return 1; + else if (reg->u32_max_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->s32_min_value >= sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLE: + if (reg->u32_max_value <= val) + return 1; + else if (reg->u32_min_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->s32_max_value <= sval) + return 1; + else if (reg->s32_min_value > sval) + return 0; + break; + } - if (is_jmp32) { - reg_lo = *reg; - reg = ®_lo; - /* For JMP32, only low 32 bits are compared, coerce_reg_to_size - * could truncate high bits and update umin/umax according to - * information of low bits. - */ - coerce_reg_to_size(reg, 4); - /* smin/smax need special handling. For example, after coerce, - * if smin_value is 0x00000000ffffffffLL, the value is -1 when - * used as operand to JMP32. It is a negative number from s32's - * point of view, while it is a positive number when seen as - * s64. The smin/smax are kept as s64, therefore, when used with - * JMP32, they need to be transformed into s32, then sign - * extended back to s64. - * - * Also, smin/smax were copied from umin/umax. If umin/umax has - * different sign bit, then min/max relationship doesn't - * maintain after casting into s32, for this case, set smin/smax - * to safest range. - */ - if ((reg->umax_value ^ reg->umin_value) & - (1ULL << 31)) { - reg->smin_value = S32_MIN; - reg->smax_value = S32_MAX; - } - reg->smin_value = (s64)(s32)reg->smin_value; - reg->smax_value = (s64)(s32)reg->smax_value; + return -1; +} - val = (u32)val; - sval = (s64)(s32)val; - } else { - sval = (s64)val; - } + +static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + s64 sval = (s64)val; switch (opcode) { case BPF_JEQ: @@ -5578,27 +6239,22 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return -1; } -/* Generate min value of the high 32-bit from TNUM info. */ -static u64 gen_hi_min(struct tnum var) -{ - return var.value & ~0xffffffffULL; -} - -/* Generate max value of the high 32-bit from TNUM info. */ -static u64 gen_hi_max(struct tnum var) -{ - return (var.value | var.mask) & ~0xffffffffULL; -} - -/* Return true if VAL is compared with a s64 sign extended from s32, and they - * are with the same signedness. +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg < 5)" is unknown when register value + * range [0,10] */ -static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { - return ((s32)sval >= 0 && - reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || - ((s32)sval < 0 && - reg->smax_value <= 0 && reg->smin_value >= S32_MIN); + if (__is_pointer_value(false, reg)) + return -1; + + if (is_jmp32) + return is_branch32_taken(reg, val, opcode); + return is_branch64_taken(reg, val, opcode); } /* Adjusts the register min/max values in the case that the dst_reg is the @@ -5607,10 +6263,16 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) * In JEQ/JNE cases we also adjust the var_off values. */ static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { - s64 sval; + struct tnum false_32off = tnum_subreg(false_reg->var_off); + struct tnum false_64off = false_reg->var_off; + struct tnum true_32off = tnum_subreg(true_reg->var_off); + struct tnum true_64off = true_reg->var_off; + s64 sval = (s64)val; + s32 sval32 = (s32)val32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into @@ -5621,9 +6283,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; - val = is_jmp32 ? (u32)val : val; - sval = is_jmp32 ? (s64)(s32)val : (s64)val; - switch (opcode) { case BPF_JEQ: case BPF_JNE: @@ -5635,211 +6294,150 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, * if it is true we know the value for sure. Likewise for * BPF_JNE. */ - if (is_jmp32) { - u64 old_v = reg->var_off.value; - u64 hi_mask = ~0xffffffffULL; - - reg->var_off.value = (old_v & hi_mask) | val; - reg->var_off.mask &= hi_mask; - } else { + if (is_jmp32) + __mark_reg32_known(reg, val32); + else __mark_reg_known(reg, val); - } break; } case BPF_JSET: - false_reg->var_off = tnum_and(false_reg->var_off, - tnum_const(~val)); - if (is_power_of_2(val)) - true_reg->var_off = tnum_or(true_reg->var_off, - tnum_const(val)); + if (is_jmp32) { + false_32off = tnum_and(false_32off, tnum_const(~val32)); + if (is_power_of_2(val32)) + true_32off = tnum_or(true_32off, + tnum_const(val32)); + } else { + false_64off = tnum_and(false_64off, tnum_const(~val)); + if (is_power_of_2(val)) + true_64off = tnum_or(true_64off, + tnum_const(val)); + } break; case BPF_JGE: case BPF_JGT: { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); + u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; + u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; + + false_reg->u32_max_value = min(false_reg->u32_max_value, + false_umax); + true_reg->u32_min_value = max(true_reg->u32_min_value, + true_umin); + } else { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); break; } case BPF_JSGE: case BPF_JSGT: { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + if (is_jmp32) { + s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; + s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - /* If the full s64 was not sign-extended from s32 then don't - * deduct further info. - */ - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); + true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + } else { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); + } break; } case BPF_JLE: case BPF_JLT: { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); + u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; + u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; + + false_reg->u32_min_value = max(false_reg->u32_min_value, + false_umin); + true_reg->u32_max_value = min(true_reg->u32_max_value, + true_umax); + } else { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); break; } case BPF_JSLE: case BPF_JSLT: { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + if (is_jmp32) { + s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; + s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); + true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + } else { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); + } break; } default: - break; + return; } - __reg_deduce_bounds(false_reg); - __reg_deduce_bounds(true_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(false_reg); - __reg_bound_offset(true_reg); if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); + false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), + tnum_subreg(false_32off)); + true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), + tnum_subreg(true_32off)); + __reg_combine_32_into_64(false_reg); + __reg_combine_32_into_64(true_reg); + } else { + false_reg->var_off = false_64off; + true_reg->var_off = true_64off; + __reg_combine_64_into_32(false_reg); + __reg_combine_64_into_32(true_reg); } - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. - */ - __update_reg_bounds(false_reg); - __update_reg_bounds(true_reg); } /* Same as above, but for the case that dst_reg holds a constant and src_reg is * the variable reg. */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { - s64 sval; - - if (__is_pointer_value(false, false_reg)) - return; - - val = is_jmp32 ? (u32)val : val; - sval = is_jmp32 ? (s64)(s32)val : (s64)val; - - switch (opcode) { - case BPF_JEQ: - case BPF_JNE: - { - struct bpf_reg_state *reg = - opcode == BPF_JEQ ? true_reg : false_reg; - - if (is_jmp32) { - u64 old_v = reg->var_off.value; - u64 hi_mask = ~0xffffffffULL; - - reg->var_off.value = (old_v & hi_mask) | val; - reg->var_off.mask &= hi_mask; - } else { - __mark_reg_known(reg, val); - } - break; - } - case BPF_JSET: - false_reg->var_off = tnum_and(false_reg->var_off, - tnum_const(~val)); - if (is_power_of_2(val)) - true_reg->var_off = tnum_or(true_reg->var_off, - tnum_const(val)); - break; - case BPF_JGE: - case BPF_JGT: - { - u64 false_umin = opcode == BPF_JGT ? val : val + 1; - u64 true_umax = opcode == BPF_JGT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); - break; - } - case BPF_JSGE: - case BPF_JSGT: - { - s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; - - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); - break; - } - case BPF_JLE: - case BPF_JLT: - { - u64 false_umax = opcode == BPF_JLT ? val : val - 1; - u64 true_umin = opcode == BPF_JLT ? val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); - break; - } - case BPF_JSLE: - case BPF_JSLT: - { - s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; - - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); - break; - } - default: - break; - } - - __reg_deduce_bounds(false_reg); - __reg_deduce_bounds(true_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(false_reg); - __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. + /* How can we transform "a <op> b" into "b <op> a"? */ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + opcode = opcode_flip[opcode >> 4]; + /* This uses zero as "not present in table"; luckily the zero opcode, + * BPF_JA, can't get here. */ - __update_reg_bounds(false_reg); - __update_reg_bounds(true_reg); + if (opcode) + reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); } /* Regs are known to be equal, so intersect their min/max/var_off */ @@ -6128,13 +6726,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - if (BPF_SRC(insn->code) == BPF_K) - pred = is_branch_taken(dst_reg, insn->imm, - opcode, is_jmp32); - else if (src_reg->type == SCALAR_VALUE && - tnum_is_const(src_reg->var_off)) - pred = is_branch_taken(dst_reg, src_reg->var_off.value, - opcode, is_jmp32); + if (BPF_SRC(insn->code) == BPF_K) { + pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { + pred = is_branch_taken(dst_reg, + tnum_subreg(src_reg->var_off).value, + opcode, + is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(src_reg->var_off)) { + pred = is_branch_taken(dst_reg, + src_reg->var_off.value, + opcode, + is_jmp32); + } + if (pred >= 0) { err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) @@ -6168,32 +6775,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - struct bpf_reg_state lo_reg0 = *dst_reg; - struct bpf_reg_state lo_reg1 = *src_reg; - struct bpf_reg_state *src_lo, *dst_lo; - - dst_lo = &lo_reg0; - src_lo = &lo_reg1; - coerce_reg_to_size(dst_lo, 4); - coerce_reg_to_size(src_lo, 4); if (dst_reg->type == SCALAR_VALUE && src_reg->type == SCALAR_VALUE) { if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && tnum_is_const(src_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(src_reg->var_off)))) reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, - is_jmp32 - ? src_lo->var_off.value - : src_reg->var_off.value, + src_reg->var_off.value, + tnum_subreg(src_reg->var_off).value, opcode, is_jmp32); else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && tnum_is_const(dst_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(dst_reg->var_off)))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], src_reg, - is_jmp32 - ? dst_lo->var_off.value - : dst_reg->var_off.value, + dst_reg->var_off.value, + tnum_subreg(dst_reg->var_off).value, opcode, is_jmp32); else if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) @@ -6204,7 +6803,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode, is_jmp32); + dst_reg, insn->imm, (u32)insn->imm, + opcode, is_jmp32); } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). @@ -6405,8 +7005,9 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); int err; - /* The struct_ops func-ptr's return type could be "void" */ - if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + /* LSM and struct_ops func-ptr's return type could be "void" */ + if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || + env->prog->type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -8139,26 +8740,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } +static bool is_preallocated_map(struct bpf_map *map) +{ + if (!check_map_prealloc(map)) + return false; + if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) + return false; + return true; +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { - /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use - * preallocated hash maps, since doing memory allocation - * in overflow_handler can crash depending on where nmi got - * triggered. + /* + * Validate that trace type programs use preallocated hash maps. + * + * For programs attached to PERF events this is mandatory as the + * perf NMI can hit any arbitrary code sequence. + * + * All other trace types using preallocated hash maps are unsafe as + * well because tracepoint or kprobes can be inside locked regions + * of the memory allocator or at a place where a recursion into the + * memory allocator would see inconsistent state. + * + * On RT enabled kernels run-time allocation of all trace type + * programs is strictly prohibited due to lock type constraints. On + * !RT kernels it is allowed for backwards compatibility reasons for + * now, but warnings are emitted so developers are made aware of + * the unsafety and can fix their programs before this is enforced. */ - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { - if (!check_map_prealloc(map)) { + if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } - if (map->inner_map_meta && - !check_map_prealloc(map->inner_map_meta)) { - verbose(env, "perf_event programs can only use preallocated inner hash map\n"); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "trace type programs can only use preallocated hash map\n"); return -EINVAL; } + WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); + verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } if ((is_tracing_prog_type(prog->type) || @@ -9774,6 +10397,26 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return 0; } +#define SECURITY_PREFIX "security_" + +static int check_attach_modify_return(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; + + /* This is expected to be cleaned up in the future with the KRSI effort + * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. + */ + if (within_error_injection_list(addr) || + !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, + sizeof(SECURITY_PREFIX) - 1)) + return 0; + + verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + + return -EINVAL; +} static int check_attach_btf_id(struct bpf_verifier_env *env) { @@ -9794,7 +10437,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + !prog_extension) return 0; if (!btf_id) { @@ -9924,8 +10569,17 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + prog->aux->attach_func_name = tname; + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + if (!btf_type_is_func(t)) { verbose(env, "attach_btf_id %u is not a function\n", btf_id); @@ -9940,7 +10594,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; - prog->aux->attach_func_name = tname; /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); @@ -9973,6 +10626,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) + ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f2d7cea86ffe..191c329e482a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -38,10 +38,7 @@ static bool cgroup_no_v1_named; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ +/* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) @@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; - mutex_lock(&cgroup_mutex); + /* snoop agent path and exit early if empty */ + if (!cgrp->root->release_agent_path[0]) + return; + /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out_free; - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); + spin_lock(&release_agent_path_lock); + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + spin_unlock(&release_agent_path_lock); + if (!agentbuf[0]) + goto out_free; + + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) - goto out; + goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; @@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); out_free: kfree(agentbuf); kfree(pathbuf); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3dead0416b91..06b5ea9d899d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1813,12 +1813,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, Opt_memory_localevents, + Opt_memory_recursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("memory_localevents", Opt_memory_localevents), + fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} }; @@ -1839,6 +1841,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; + case Opt_memory_recursiveprot: + ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; + return 0; } return -EINVAL; } @@ -1855,6 +1860,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; + + if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; } } @@ -1864,6 +1874,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) + seq_puts(seq, ",memory_recursiveprot"); return 0; } @@ -1954,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_SUPPORT_EXPORTOP, + KERNFS_ROOT_SUPPORT_EXPORTOP | + KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2714,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; - int ret; - - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -4148,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &parent->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling, + lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } @@ -4391,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); - /* Advance to the next non-empty css_set */ - do { - cset = css_task_iter_next_css_set(it); - if (!cset) { - it->task_pos = NULL; - return; + /* Advance to the next non-empty css_set and find first non-empty tasks list*/ + while ((cset = css_task_iter_next_css_set(it))) { + if (!list_empty(&cset->tasks)) { + it->cur_tasks_head = &cset->tasks; + break; + } else if (!list_empty(&cset->mg_tasks)) { + it->cur_tasks_head = &cset->mg_tasks; + break; + } else if (!list_empty(&cset->dying_tasks)) { + it->cur_tasks_head = &cset->dying_tasks; + break; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - - if (!list_empty(&cset->tasks)) { - it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { - it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; } - - it->tasks_head = &cset->tasks; - it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; + if (!cset) { + it->task_pos = NULL; + return; + } + it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus @@ -4458,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it) repeat: if (it->task_pos) { /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. + * Advance iterator to find next entry. We go through cset + * tasks, mg_tasks and dying_tasks, when consumed we move onto + * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; + if (it->task_pos == &it->cur_cset->tasks) { + it->cur_tasks_head = &it->cur_cset->mg_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; + if (it->task_pos == &it->cur_cset->mg_tasks) { + it->cur_tasks_head = &it->cur_cset->dying_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->dying_tasks_head) + if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ @@ -4493,12 +4498,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } @@ -4662,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) +{ + int ret; + struct inode *inode; + + lockdep_assert_held(&cgroup_mutex); + + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + return ret; +} + static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; int ret; lockdep_assert_held(&cgroup_mutex); @@ -4678,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); + ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; @@ -4699,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, return 0; } +static int cgroup_attach_permissions(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb, bool threadgroup) +{ + int ret = 0; + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + if (ret) + return ret; + + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; + + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) + ret = -EOPNOTSUPP; + + return ret; +} + static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -4721,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, true); if (ret) goto out_finish; @@ -4766,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, spin_unlock_irq(&css_set_lock); /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, false); if (ret) goto out_finish; - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: @@ -5864,8 +5894,7 @@ out: * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding reference to its css_set. + * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { @@ -5873,21 +5902,172 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} + +/** + * cgroup_css_set_fork - find or create a css_set for a child process + * @kargs: the arguments passed to create the child process + * + * This functions finds or creates a new css_set which the child + * process will be attached to in cgroup_post_fork(). By default, + * the child process will be given the same css_set as its parent. + * + * If CLONE_INTO_CGROUP is specified this function will try to find an + * existing css_set which includes the requested cgroup and if not create + * a new css_set that the child will be attached to later. If this function + * succeeds it will hold cgroup_threadgroup_rwsem on return. If + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex + * before grabbing cgroup_threadgroup_rwsem and will hold a reference + * to the target cgroup. + */ +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) +{ + int ret; + struct cgroup *dst_cgrp = NULL; + struct css_set *cset; + struct super_block *sb; + struct file *f; + + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); + + cgroup_threadgroup_change_begin(current); + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (!(kargs->flags & CLONE_INTO_CGROUP)) { + kargs->cset = cset; + return 0; + } + + f = fget_raw(kargs->cgroup); + if (!f) { + ret = -EBADF; + goto err; + } + sb = f->f_path.dentry->d_sb; + + dst_cgrp = cgroup_get_from_file(f); + if (IS_ERR(dst_cgrp)) { + ret = PTR_ERR(dst_cgrp); + dst_cgrp = NULL; + goto err; + } + + if (cgroup_is_dead(dst_cgrp)) { + ret = -ENODEV; + goto err; + } + + /* + * Verify that we the target cgroup is writable for us. This is + * usually done by the vfs layer but since we're not going through + * the vfs layer here we need to do it "manually". + */ + ret = cgroup_may_write(dst_cgrp, sb); + if (ret) + goto err; + + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, + !(kargs->flags & CLONE_THREAD)); + if (ret) + goto err; + + kargs->cset = find_css_set(cset, dst_cgrp); + if (!kargs->cset) { + ret = -ENOMEM; + goto err; + } + + put_css_set(cset); + fput(f); + kargs->cgrp = dst_cgrp; + return ret; + +err: + cgroup_threadgroup_change_end(current); + mutex_unlock(&cgroup_mutex); + if (f) + fput(f); + if (dst_cgrp) + cgroup_put(dst_cgrp); + put_css_set(cset); + if (kargs->cset) + put_css_set(kargs->cset); + return ret; +} + +/** + * cgroup_css_set_put_fork - drop references we took during fork + * @kargs: the arguments passed to create the child process + * + * Drop references to the prepared css_set and target cgroup if + * CLONE_INTO_CGROUP was requested. + */ +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) +{ + cgroup_threadgroup_change_end(current); + + if (kargs->flags & CLONE_INTO_CGROUP) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + + mutex_unlock(&cgroup_mutex); + + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + + if (cgrp) { + cgroup_put(cgrp); + kargs->cgrp = NULL; + } + } +} + /** * cgroup_can_fork - called on a new task before the process is exposed - * @child: the task in question. + * @child: the child process * - * This calls the subsystem can_fork() callbacks. If the can_fork() callback - * returns an error, the fork aborts with that error code. This allows for - * a cgroup subsystem to conditionally allow or deny new forks. + * This prepares a new css_set for the child process which the child will + * be attached to in cgroup_post_fork(). + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() + * callback returns an error, the fork aborts with that error code. This + * allows for a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child) +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; + ret = cgroup_css_set_fork(kargs); + if (ret) + return ret; + do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); + ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); @@ -5899,54 +6079,64 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); } + cgroup_css_set_put_fork(kargs); + return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the task in question + * @child: the child process + * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded. + * cgroup_can_fork() succeded and cleans up references we took to + * prepare a new css_set for the child process in cgroup_can_fork(). */ -void cgroup_cancel_fork(struct task_struct *child) +void cgroup_cancel_fork(struct task_struct *child, + struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); + + cgroup_css_set_put_fork(kargs); } /** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary and - * call the subsystem fork() callbacks. Has to be after the task is - * visible on the task list in case we race with the first call to - * cgroup_task_iter_start() - to guarantee that the new task ends up on its - * list. + * cgroup_post_fork - finalize cgroup setup for the child process + * @child: the child process + * + * Attach the child process to its css_set calling the subsystem fork() + * callbacks. */ -void cgroup_post_fork(struct task_struct *child) +void cgroup_post_fork(struct task_struct *child, + struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup_subsys *ss; struct css_set *cset; int i; + cset = kargs->cset; + kargs->cset = NULL; + spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); + } else { + put_css_set(cset); + cset = NULL; } /* @@ -5978,6 +6168,17 @@ void cgroup_post_fork(struct task_struct *child) do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); + + /* Make the new cset the root_cset of the new cgroup namespace. */ + if (kargs->flags & CLONE_NEWCGROUP) { + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; + + get_css_set(cset); + child->nsproxy->cgroup_ns->root_cset = cset; + put_css_set(rcset); + } + + cgroup_css_set_put_fork(kargs); } /** @@ -6164,7 +6365,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_get_from_fd(int fd) { - struct cgroup_subsys_state *css; struct cgroup *cgrp; struct file *f; @@ -6172,17 +6372,8 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + cgrp = cgroup_get_from_file(f); fput(f); - if (IS_ERR(css)) - return ERR_CAST(css); - - cgrp = css->cgroup; - if (!cgroup_on_dfl(cgrp)) { - cgroup_put(cgrp); - return ERR_PTR(-EBADF); - } - return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); @@ -6303,27 +6494,58 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); mutex_unlock(&cgroup_mutex); return ret; } + +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, + struct bpf_prog *new_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + if (link->ops != &bpf_cgroup_link_lops) + return -EINVAL; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -6381,7 +6603,10 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); + return snprintf(buf, PAGE_SIZE, + "nsdelegate\n" + "memory_localevents\n" + "memory_recursiveprot\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58f5073acff7..729d3a5c772e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. + * With v2 behavior, "cpus" and "mems" are always what the users have + * requested and won't be changed by hotplug events. Only the effective + * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 138059eb730d..511af87f685e 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -33,6 +33,7 @@ #include <linux/atomic.h> #include <linux/cgroup.h> #include <linux/slab.h> +#include <linux/sched/task.h> #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" @@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task) +static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; int err; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); err = pids_try_charge(pids, 1); if (err) { @@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task) return err; } -static void pids_cancel_fork(struct task_struct *task) +static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); pids_uncharge(pids, 1); } diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..8a44b93da0f3 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0296b4bda8f1..ce430885c26c 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -198,11 +198,13 @@ void __init context_tracking_cpu_set(int cpu) if (initialized) return; +#ifdef CONFIG_HAVE_TIF_NOHZ /* * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork * This assumes that init is the only task at this early boot stage. */ set_tsk_thread_flag(&init_task, TIF_NOHZ); +#endif WARN_ON_ONCE(!tasklist_empty()); initialized = true; diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c706af713fb..2371292f30b0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void) static void lockdep_acquire_cpus_lock(void) { - rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); + rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* @@ -1041,7 +1041,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) return _cpu_down(cpu, 0, target); } -static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) +static int cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -1051,11 +1051,72 @@ static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) return err; } -int cpu_down(unsigned int cpu) +/** + * cpu_device_down - Bring down a cpu device + * @dev: Pointer to the cpu device to offline + * + * This function is meant to be used by device core cpu subsystem only. + * + * Other subsystems should use remove_cpu() instead. + */ +int cpu_device_down(struct device *dev) { - return do_cpu_down(cpu, CPUHP_OFFLINE); + return cpu_down(dev->id, CPUHP_OFFLINE); +} + +int remove_cpu(unsigned int cpu) +{ + int ret; + + lock_device_hotplug(); + ret = device_offline(get_cpu_device(cpu)); + unlock_device_hotplug(); + + return ret; +} +EXPORT_SYMBOL_GPL(remove_cpu); + +void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) +{ + unsigned int cpu; + int error; + + cpu_maps_update_begin(); + + /* + * Make certain the cpu I'm about to reboot on is online. + * + * This is inline to what migrate_to_reboot_cpu() already do. + */ + if (!cpu_online(primary_cpu)) + primary_cpu = cpumask_first(cpu_online_mask); + + for_each_online_cpu(cpu) { + if (cpu == primary_cpu) + continue; + + error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (error) { + pr_err("Failed to offline CPU%d - error=%d", + cpu, error); + break; + } + } + + /* + * Ensure all but the reboot CPU are offline. + */ + BUG_ON(num_online_cpus() > 1); + + /* + * Make sure the CPUs won't be enabled by someone else after this + * point. Kexec will reboot to a new kernel shortly resetting + * everything along the way. + */ + cpu_hotplug_disabled++; + + cpu_maps_update_done(); } -EXPORT_SYMBOL(cpu_down); #else #define takedown_cpu NULL @@ -1124,8 +1185,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) } /* - * The caller of do_cpu_up might have raced with another - * caller. Ignore it for now. + * The caller of cpu_up() might have raced with another + * caller. Nothing to do. */ if (st->state >= target) goto out; @@ -1169,7 +1230,7 @@ out: return ret; } -static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) +static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -1203,16 +1264,70 @@ out: return err; } -int cpu_up(unsigned int cpu) +/** + * cpu_device_up - Bring up a cpu device + * @dev: Pointer to the cpu device to online + * + * This function is meant to be used by device core cpu subsystem only. + * + * Other subsystems should use add_cpu() instead. + */ +int cpu_device_up(struct device *dev) +{ + return cpu_up(dev->id, CPUHP_ONLINE); +} + +int add_cpu(unsigned int cpu) +{ + int ret; + + lock_device_hotplug(); + ret = device_online(get_cpu_device(cpu)); + unlock_device_hotplug(); + + return ret; +} +EXPORT_SYMBOL_GPL(add_cpu); + +/** + * bringup_hibernate_cpu - Bring up the CPU that we hibernated on + * @sleep_cpu: The cpu we hibernated on and should be brought up. + * + * On some architectures like arm64, we can hibernate on any CPU, but on + * wake up the CPU we hibernated on might be offline as a side effect of + * using maxcpus= for example. + */ +int bringup_hibernate_cpu(unsigned int sleep_cpu) { - return do_cpu_up(cpu, CPUHP_ONLINE); + int ret; + + if (!cpu_online(sleep_cpu)) { + pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); + ret = cpu_up(sleep_cpu, CPUHP_ONLINE); + if (ret) { + pr_err("Failed to bring hibernate-CPU up!\n"); + return ret; + } + } + return 0; +} + +void bringup_nonboot_cpus(unsigned int setup_max_cpus) +{ + unsigned int cpu; + + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu, CPUHP_ONLINE); + } } -EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int freeze_secondary_cpus(int primary) +int __freeze_secondary_cpus(int primary, bool suspend) { int cpu, error = 0; @@ -1237,7 +1352,7 @@ int freeze_secondary_cpus(int primary) if (cpu == primary) continue; - if (pm_wakeup_pending()) { + if (suspend && pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -2028,9 +2143,9 @@ static ssize_t write_cpuhp_target(struct device *dev, goto out; if (st->state < target) - ret = do_cpu_up(dev->id, target); + ret = cpu_up(dev->id, target); else - ret = do_cpu_down(dev->id, target); + ret = cpu_down(dev->id, target); out: unlock_device_hotplug(); return ret ? ret : count; diff --git a/kernel/cred.c b/kernel/cred.c index 809a985b1793..71a792616917 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -675,8 +675,6 @@ void __init cred_init(void) * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. - * - * Does not take, and does not return holding current->cred_replace_mutex. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore index 396d12eda9e8..df259542a236 100644 --- a/kernel/debug/kdb/.gitignore +++ b/kernel/debug/kdb/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only gen-kdb_cmds.c diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ba12e9f4661e..515379cbf209 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -399,6 +399,13 @@ int kdb_set(int argc, const char **argv) return KDB_ARGCOUNT; /* + * Censor sensitive variables + */ + if (strcmp(argv[1], "PROMPT") == 0 && + !kdb_check_flags(KDB_ENABLE_MEM_READ, kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Check for internal variables */ if (strcmp(argv[1], "KDBDEBUG") == 0) { @@ -1102,12 +1109,12 @@ static int handle_ctrl_cmd(char *cmd) case CTRL_P: if (cmdptr != cmd_tail) cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: if (cmdptr != cmd_head) cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; } return 0; @@ -1298,12 +1305,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *(cmd_hist[cmd_head]) = '\0'; do_full_getstr: -#if defined(CONFIG_SMP) + /* PROMPT can only be set if we have MEM_READ permission. */ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), raw_smp_processor_id()); -#else - snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); -#endif if (defcmd_in_progress) strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); @@ -1314,7 +1318,7 @@ do_full_getstr: if (*cmdbuf != '\n') { if (*cmdbuf < 32) { if (cmdptr == cmd_head) { - strncpy(cmd_hist[cmd_head], cmd_cur, + strscpy(cmd_hist[cmd_head], cmd_cur, CMD_BUFLEN); *(cmd_hist[cmd_head] + strlen(cmd_hist[cmd_head])-1) = '\0'; @@ -1324,7 +1328,7 @@ do_full_getstr: cmdbuf = cmd_cur; goto do_full_getstr; } else { - strncpy(cmd_hist[cmd_head], cmd_cur, + strscpy(cmd_hist[cmd_head], cmd_cur, CMD_BUFLEN); } diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 551b0eb7028a..2a0c4985f38e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -134,7 +134,7 @@ static void *__dma_alloc_from_coherent(struct device *dev, spin_lock_irqsave(&mem->spinlock, flags); - if (unlikely(size > (mem->size << PAGE_SHIFT))) + if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT))) goto err; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); @@ -144,8 +144,9 @@ static void *__dma_alloc_from_coherent(struct device *dev, /* * Memory was found in the coherent area. */ - *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); - ret = mem->virt_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + + ((dma_addr_t)pageno << PAGE_SHIFT); + ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); return ret; @@ -194,7 +195,7 @@ static int __dma_release_from_coherent(struct dma_coherent_mem *mem, int order, void *vaddr) { if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { + (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) { int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; unsigned long flags; @@ -238,10 +239,10 @@ static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { if (mem && vaddr >= mem->virt_base && vaddr + size <= - (mem->virt_base + (mem->size << PAGE_SHIFT))) { + (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) { unsigned long off = vma->vm_pgoff; int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; - int user_count = vma_pages(vma); + unsigned long user_count = vma_pages(vma); int count = PAGE_ALIGN(size) >> PAGE_SHIFT; *ret = -ENXIO; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 2031ed1ad7fa..9e1777c81f55 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -137,9 +137,12 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[5] = { "single", "page", - "scather-gather", "coherent", - "resource" }; +static const char *type2name[] = { + [dma_debug_single] = "single", + [dma_debug_sg] = "scather-gather", + [dma_debug_coherent] = "coherent", + [dma_debug_resource] = "resource", +}; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ac7956c38f69..8f4bbdaf965e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -39,7 +39,8 @@ static inline struct page *dma_direct_to_page(struct device *dev, u64 dma_direct_get_required_mask(struct device *dev) { - u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; + u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } @@ -157,11 +158,8 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - + if (!ret) + goto out_free_pages; memset(ret, 0, size); goto done; } @@ -174,8 +172,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. */ dev_info(dev, "Rejecting highmem page from CMA.\n"); - dma_free_contiguous(dev, page, size); - return NULL; + goto out_free_pages; } ret = page_address(page); @@ -184,10 +181,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && dma_alloc_need_uncached(dev, attrs)) { arch_dma_prep_coherent(page, size); - ret = uncached_kernel_address(ret); + ret = arch_dma_set_uncached(ret, size); + if (IS_ERR(ret)) + goto out_free_pages; } done: if (force_dma_unencrypted(dev)) @@ -195,6 +194,9 @@ done: else *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, @@ -218,6 +220,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) vunmap(cpu_addr); + else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } @@ -225,7 +229,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); @@ -235,7 +239,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 12ff766ec1fa..98e3d873792e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -154,6 +154,8 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs); */ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { + if (force_dma_unencrypted(dev)) + prot = pgprot_decrypted(prot); if (dev_is_dma_coherent(dev) || (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && (attrs & DMA_ATTR_NON_CONSISTENT))) diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..bc9b98a9af9a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -28,6 +28,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> @@ -49,6 +50,7 @@ #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> +#include <linux/min_heap.h> #include "internal.h" @@ -386,6 +388,7 @@ static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; +static atomic_t nr_cgroup_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -891,6 +894,47 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_unlock(); } +static int perf_cgroup_ensure_storage(struct perf_event *event, + struct cgroup_subsys_state *css) +{ + struct perf_cpu_context *cpuctx; + struct perf_event **storage; + int cpu, heap_size, ret = 0; + + /* + * Allow storage to have sufficent space for an iterator for each + * possibly nested cgroup plus an iterator for events with no cgroup. + */ + for (heap_size = 1; css; css = css->parent) + heap_size++; + + for_each_possible_cpu(cpu) { + cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + if (heap_size <= cpuctx->heap_size) + continue; + + storage = kmalloc_node(heap_size * sizeof(struct perf_event *), + GFP_KERNEL, cpu_to_node(cpu)); + if (!storage) { + ret = -ENOMEM; + break; + } + + raw_spin_lock_irq(&cpuctx->ctx.lock); + if (cpuctx->heap_size < heap_size) { + swap(cpuctx->heap, storage); + if (storage == cpuctx->heap_default) + storage = NULL; + cpuctx->heap_size = heap_size; + } + raw_spin_unlock_irq(&cpuctx->ctx.lock); + + kfree(storage); + } + + return ret; +} + static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -910,6 +954,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, goto out; } + ret = perf_cgroup_ensure_storage(event, css); + if (ret) + goto out; + cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -935,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -/* - * Update cpuctx->cgrp so that it is set when first cgroup event is added and - * cleared when last cgroup event is removed. - */ static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; - struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -961,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event, * because if the first would mismatch, the second would not try again * and we would leave cpuctx->cgrp unset. */ - if (add && !cpuctx->cgrp) { + if (ctx->is_active && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; } - if (add && ctx->nr_cgroups++) + if (ctx->nr_cgroups++) + return; + + list_add(&cpuctx->cgrp_cpuctx_entry, + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); +} + +static inline void +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) return; - else if (!add && --ctx->nr_cgroups) + + /* + * Because cgroup events are always per-cpu events, + * @ctx == &cpuctx->ctx. + */ + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); + + if (--ctx->nr_cgroups) return; - /* no cgroup running */ - if (!add) + if (ctx->is_active && cpuctx->cgrp) cpuctx->cgrp = NULL; - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - if (add) - list_add(cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); - else - list_del(cpuctx_entry); + list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1048,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) } static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } +static inline void +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) +{ +} #endif /* @@ -1249,7 +1307,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * cred_guard_mutex + * exec_update_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -1531,6 +1589,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right) if (left->cpu > right->cpu) return false; +#ifdef CONFIG_CGROUP_PERF + if (left->cgrp != right->cgrp) { + if (!left->cgrp || !left->cgrp->css.cgroup) { + /* + * Left has no cgroup but right does, no cgroups come + * first. + */ + return true; + } + if (!right->cgrp || !right->cgrp->css.cgroup) { + /* + * Right has no cgroup but left does, no cgroups come + * first. + */ + return false; + } + /* Two dissimilar cgroups, order by id. */ + if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) + return true; + + return false; + } +#endif + if (left->group_index < right->group_index) return true; if (left->group_index > right->group_index) @@ -1610,25 +1692,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the @cpu subtree. + * Get the leftmost event in the cpu/cgroup subtree. */ static struct perf_event * -perf_event_groups_first(struct perf_event_groups *groups, int cpu) +perf_event_groups_first(struct perf_event_groups *groups, int cpu, + struct cgroup *cgrp) { struct perf_event *node_event = NULL, *match = NULL; struct rb_node *node = groups->tree.rb_node; +#ifdef CONFIG_CGROUP_PERF + u64 node_cgrp_id, cgrp_id = 0; + + if (cgrp) + cgrp_id = cgrp->kn->id; +#endif while (node) { node_event = container_of(node, struct perf_event, group_node); if (cpu < node_event->cpu) { node = node->rb_left; - } else if (cpu > node_event->cpu) { + continue; + } + if (cpu > node_event->cpu) { node = node->rb_right; - } else { - match = node_event; + continue; + } +#ifdef CONFIG_CGROUP_PERF + node_cgrp_id = 0; + if (node_event->cgrp && node_event->cgrp->css.cgroup) + node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + + if (cgrp_id < node_cgrp_id) { node = node->rb_left; + continue; } + if (cgrp_id > node_cgrp_id) { + node = node->rb_right; + continue; + } +#endif + match = node_event; + node = node->rb_left; } return match; @@ -1641,12 +1746,26 @@ static struct perf_event * perf_event_groups_next(struct perf_event *event) { struct perf_event *next; +#ifdef CONFIG_CGROUP_PERF + u64 curr_cgrp_id = 0; + u64 next_cgrp_id = 0; +#endif next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next && next->cpu == event->cpu) - return next; + if (next == NULL || next->cpu != event->cpu) + return NULL; - return NULL; +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp && event->cgrp->css.cgroup) + curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + + if (next->cgrp && next->cgrp->css.cgroup) + next_cgrp_id = next->cgrp->css.cgroup->kn->id; + + if (curr_cgrp_id != next_cgrp_id) + return NULL; +#endif + return next; } /* @@ -1682,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) add_event_to_groups(event, ctx); } - list_update_cgroup_event(event, ctx, true); - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_enable(event, ctx); + ctx->generation++; } @@ -1754,6 +1874,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + size += sizeof(data->cgroup); + event->header_size = size; } @@ -1864,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - list_update_cgroup_event(event, ctx, false); - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -1882,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of error state is by explicit re-enabling * of the event */ - if (event->state > PERF_EVENT_STATE_OFF) + if (event->state > PERF_EVENT_STATE_OFF) { + perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); + } ctx->generation++; } @@ -1986,6 +2109,12 @@ static int perf_get_aux_event(struct perf_event *event, return 1; } +static inline struct list_head *get_event_list(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -2028,12 +2157,8 @@ static void perf_group_detach(struct perf_event *event) if (!RB_EMPTY_NODE(&event->group_node)) { add_event_to_groups(sibling, event->ctx); - if (sibling->state == PERF_EVENT_STATE_ACTIVE) { - struct list_head *list = sibling->attr.pinned ? - &ctx->pinned_active : &ctx->flexible_active; - - list_add_tail(&sibling->active_list, list); - } + if (sibling->state == PERF_EVENT_STATE_ACTIVE) + list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); @@ -2112,6 +2237,7 @@ event_sched_out(struct perf_event *event, if (READ_ONCE(event->pending_disable) >= 0) { WRITE_ONCE(event->pending_disable, -1); + perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); @@ -2182,6 +2308,7 @@ __perf_remove_from_context(struct perf_event *event, if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; + ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2248,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event, event_sched_out(event, cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); + perf_cgroup_event_disable(event, ctx); } /* @@ -2350,6 +2478,8 @@ event_sched_in(struct perf_event *event, { int ret = 0; + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) @@ -2629,7 +2759,7 @@ static int __perf_install_in_context(void *info) } #ifdef CONFIG_CGROUP_PERF - if (is_cgroup_event(event)) { + if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. @@ -2789,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event, ctx_sched_out(ctx, cpuctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); + perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; @@ -3077,12 +3208,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; - /* - * If we had been multiplexing, no rotations are necessary, now no events - * are active. - */ - ctx->rotate_necessary = 0; - perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3092,6 +3217,13 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); + + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + ctx->rotate_necessary = 0; } perf_pmu_enable(ctx->pmu); } @@ -3388,71 +3520,104 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static int visit_groups_merge(struct perf_event_groups *groups, int cpu, - int (*func)(struct perf_event *, void *), void *data) +static bool perf_less_group_idx(const void *l, const void *r) { - struct perf_event **evt, *evt1, *evt2; - int ret; - - evt1 = perf_event_groups_first(groups, -1); - evt2 = perf_event_groups_first(groups, cpu); + const struct perf_event *le = *(const struct perf_event **)l; + const struct perf_event *re = *(const struct perf_event **)r; - while (evt1 || evt2) { - if (evt1 && evt2) { - if (evt1->group_index < evt2->group_index) - evt = &evt1; - else - evt = &evt2; - } else if (evt1) { - evt = &evt1; - } else { - evt = &evt2; - } - - ret = func(*evt, data); - if (ret) - return ret; + return le->group_index < re->group_index; +} - *evt = perf_event_groups_next(*evt); - } +static void swap_ptr(void *l, void *r) +{ + void **lp = l, **rp = r; - return 0; + swap(*lp, *rp); } -struct sched_in_data { - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - int can_add_hw; +static const struct min_heap_callbacks perf_min_heap = { + .elem_size = sizeof(struct perf_event *), + .less = perf_less_group_idx, + .swp = swap_ptr, }; -static int pinned_sched_in(struct perf_event *event, void *data) +static void __heap_add(struct min_heap *heap, struct perf_event *event) { - struct sched_in_data *sid = data; + struct perf_event **itrs = heap->data; - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; + if (event) { + itrs[heap->nr] = event; + heap->nr++; + } +} - if (!event_filter_match(event)) - return 0; +static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, + struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), + void *data) +{ +#ifdef CONFIG_CGROUP_PERF + struct cgroup_subsys_state *css = NULL; +#endif + /* Space for per CPU and/or any CPU event iterators. */ + struct perf_event *itrs[2]; + struct min_heap event_heap; + struct perf_event **evt; + int ret; + + if (cpuctx) { + event_heap = (struct min_heap){ + .data = cpuctx->heap, + .nr = 0, + .size = cpuctx->heap_size, + }; - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->pinned_active); + lockdep_assert_held(&cpuctx->ctx.lock); + +#ifdef CONFIG_CGROUP_PERF + if (cpuctx->cgrp) + css = &cpuctx->cgrp->css; +#endif + } else { + event_heap = (struct min_heap){ + .data = itrs, + .nr = 0, + .size = ARRAY_SIZE(itrs), + }; + /* Events not within a CPU context may be on any CPU. */ + __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); } + evt = event_heap.data; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + +#ifdef CONFIG_CGROUP_PERF + for (; css; css = css->parent) + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); +#endif + + min_heapify_all(&event_heap, &perf_min_heap); + + while (event_heap.nr) { + ret = func(*evt, data); + if (ret) + return ret; + + *evt = perf_event_groups_next(*evt); + if (*evt) + min_heapify(&event_heap, 0, &perf_min_heap); + else + min_heap_pop(&event_heap, &perf_min_heap); + } return 0; } -static int flexible_sched_in(struct perf_event *event, void *data) +static int merge_sched_in(struct perf_event *event, void *data) { - struct sched_in_data *sid = data; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -3460,14 +3625,19 @@ static int flexible_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - int ret = group_sched_in(event, sid->cpuctx, sid->ctx); - if (ret) { - sid->can_add_hw = 0; - sid->ctx->rotate_necessary = 1; - return 0; + if (group_can_go_on(event, cpuctx, *can_add_hw)) { + if (!group_sched_in(event, cpuctx, ctx)) + list_add_tail(&event->active_list, get_event_list(event)); + } + + if (event->state == PERF_EVENT_STATE_INACTIVE) { + if (event->attr.pinned) { + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } - list_add_tail(&event->active_list, &sid->ctx->flexible_active); + + *can_add_hw = 0; + ctx->rotate_necessary = 1; } return 0; @@ -3477,30 +3647,28 @@ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; + + if (ctx != &cpuctx->ctx) + cpuctx = NULL; - visit_groups_merge(&ctx->pinned_groups, + visit_groups_merge(cpuctx, &ctx->pinned_groups, smp_processor_id(), - pinned_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; + + if (ctx != &cpuctx->ctx) + cpuctx = NULL; - visit_groups_merge(&ctx->flexible_groups, + visit_groups_merge(cpuctx, &ctx->flexible_groups, smp_processor_id(), - flexible_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void @@ -3841,6 +4009,12 @@ ctx_event_to_rotate(struct perf_event_context *ctx) typeof(*event), group_node); } + /* + * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() + * finds there are unschedulable events, it will set it again. + */ + ctx->rotate_necessary = 0; + return event; } @@ -4456,6 +4630,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -6555,6 +6731,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6643,6 +6824,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6705,6 +6888,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6748,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe __get_user_pages_fast first. * If failed, leave phys_addr as 0. */ - if ((current->mm != NULL) && - (__get_user_pages_fast(virt, 1, 0, &p) == 1)) - phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + if (current->mm != NULL) { + pagefault_disable(); + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + pagefault_enable(); + } if (p) put_page(p); @@ -6836,6 +7025,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } @@ -6901,6 +7093,16 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); +#ifdef CONFIG_CGROUP_PERF + if (sample_type & PERF_SAMPLE_CGROUP) { + struct cgroup *cgrp; + + /* protected by RCU */ + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; + data->cgroup = cgroup_id(cgrp); + } +#endif + if (sample_type & PERF_SAMPLE_AUX) { u64 size; @@ -7574,6 +7776,105 @@ void perf_event_namespaces(struct task_struct *task) } /* + * cgroup tracking + */ +#ifdef CONFIG_CGROUP_PERF + +struct perf_cgroup_event { + char *path; + int path_size; + struct { + struct perf_event_header header; + u64 id; + char path[]; + } event_id; +}; + +static int perf_event_cgroup_match(struct perf_event *event) +{ + return event->attr.cgroup; +} + +static void perf_event_cgroup_output(struct perf_event *event, void *data) +{ + struct perf_cgroup_event *cgroup_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u16 header_size = cgroup_event->event_id.header.size; + int ret; + + if (!perf_event_cgroup_match(event)) + return; + + perf_event_header__init_id(&cgroup_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + cgroup_event->event_id.header.size); + if (ret) + goto out; + + perf_output_put(&handle, cgroup_event->event_id); + __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + cgroup_event->event_id.header.size = header_size; +} + +static void perf_event_cgroup(struct cgroup *cgrp) +{ + struct perf_cgroup_event cgroup_event; + char path_enomem[16] = "//enomem"; + char *pathname; + size_t size; + + if (!atomic_read(&nr_cgroup_events)) + return; + + cgroup_event = (struct perf_cgroup_event){ + .event_id = { + .header = { + .type = PERF_RECORD_CGROUP, + .misc = 0, + .size = sizeof(cgroup_event.event_id), + }, + .id = cgroup_id(cgrp), + }, + }; + + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (pathname == NULL) { + cgroup_event.path = path_enomem; + } else { + /* just to be sure to have enough space for alignment */ + cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); + cgroup_event.path = pathname; + } + + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(cgroup_event.path) + 1; + while (!IS_ALIGNED(size, sizeof(u64))) + cgroup_event.path[size++] = '\0'; + + cgroup_event.event_id.header.size += size; + cgroup_event.path_size = size; + + perf_iterate_sb(perf_event_cgroup_output, + &cgroup_event, + NULL); + + kfree(pathname); +} + +#endif + +/* * mmap tracking */ @@ -7693,7 +7994,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) + if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { @@ -8255,23 +8556,22 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; - char sym[KSYM_NAME_LEN]; int i; if (prog->aux->func_cnt == 0) { - bpf_get_prog_name(prog, sym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, sym); + prog->jited_len, unregister, + prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; - bpf_get_prog_name(subprog, sym); perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, sym); + subprog->jited_len, unregister, + prog->aux->ksym.name); } } } @@ -9206,7 +9506,6 @@ static void bpf_overflow_handler(struct perf_event *event, int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); - preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); @@ -9214,7 +9513,6 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); if (!ret) return; @@ -10349,6 +10647,9 @@ skip_type: cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); + + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } got_cpu_context: @@ -10616,6 +10917,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -10794,12 +11097,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; - if (cgroup_fd != -1) { - err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); - if (err) - goto err_ns; - } - pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); @@ -10821,6 +11118,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; } + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -10881,12 +11184,12 @@ err_per_task: exclusive_event_destroy(event); err_pmu: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); if (event->hw.target) @@ -10995,6 +11298,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + +#ifndef CONFIG_CGROUP_PERF + if (attr->sample_type & PERF_SAMPLE_CGROUP) + return -EINVAL; +#endif + out: return ret; @@ -11263,14 +11572,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + err = mutex_lock_interruptible(&task->signal->exec_update_mutex); if (err) goto err_task; /* * Reuse ptrace permission checks for now. * - * We must hold cred_guard_mutex across this and any potential + * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -11559,7 +11868,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); put_task_struct(task); } @@ -11595,7 +11904,7 @@ err_alloc: free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); err_task: if (task) put_task_struct(task); @@ -11900,7 +12209,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with cred_guard_mutex held when called from + * Can be called with exec_update_mutex held when called from * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) @@ -12592,6 +12901,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) kfree(jc); } +static int perf_cgroup_css_online(struct cgroup_subsys_state *css) +{ + perf_event_cgroup(css->cgroup); + return 0; +} + static int __perf_cgroup_move(void *info) { struct task_struct *task = info; @@ -12613,6 +12928,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, + .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can diff --git a/kernel/exit.c b/kernel/exit.c index 0b81b26a872a..389a88cb3081 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -103,17 +103,8 @@ static void __exit_signal(struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); - if (group_dead) { + if (group_dead) posix_cpu_timers_exit_group(tsk); - } else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - } #endif if (group_dead) { @@ -191,6 +182,7 @@ void put_task_struct_rcu_user(struct task_struct *task) void release_task(struct task_struct *p) { struct task_struct *leader; + struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and @@ -199,11 +191,11 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); rcu_read_unlock(); - proc_flush_task(p); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); + thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* @@ -226,6 +218,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + proc_flush_pid(thread_pid); release_thread(p); put_task_struct_rcu_user(p); @@ -258,6 +251,7 @@ void rcuwait_wake_up(struct rcuwait *w) wake_up_process(task); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX diff --git a/kernel/extable.c b/kernel/extable.c index a0024f27d3a1..b0ea5eb0c3b4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -34,7 +34,8 @@ u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { + if (main_extable_sort_needed && + &__stop___ex_table > &__start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } @@ -149,8 +150,6 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; - if (is_bpf_image_address(addr)) - goto out; ret = 0; out: if (no_rcu) diff --git a/kernel/fork.c b/kernel/fork.c index 86425305cd4a..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -281,7 +281,7 @@ static inline void free_thread_stack(struct task_struct *tsk) MEMCG_KERNEL_STACK_KB, -(int)(PAGE_SIZE / 1024)); - memcg_kmem_uncharge(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm->pages[i], 0); } for (i = 0; i < NR_CACHED_STACKS; i++) { @@ -361,6 +361,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) if (new) { *new = *orig; INIT_LIST_HEAD(&new->anon_vma_chain); + new->vm_next = new->vm_prev = NULL; } return new; } @@ -397,8 +398,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -413,12 +414,13 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge() fails, page->mem_cgroup - * pointer is NULL, and both memcg_kmem_uncharge() + * If memcg_kmem_charge_page() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge_page() * and mod_memcg_page_state() in free_thread_stack() * will ignore this page. So it's safe. */ - ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, + 0); if (ret) return ret; @@ -552,14 +554,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -1224,7 +1227,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->cred_guard_mutex); + err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return ERR_PTR(err); @@ -1234,7 +1237,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return mm; } @@ -1594,6 +1597,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); + mutex_init(&sig->exec_update_mutex); return 0; } @@ -1679,6 +1683,11 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) @@ -2174,16 +2183,15 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p); + retval = cgroup_can_fork(p, args); if (retval) - goto bad_fork_cgroup_threadgroup_change_end; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2288,8 +2296,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); - cgroup_threadgroup_change_end(current); + cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2300,9 +2307,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - cgroup_cancel_fork(p); -bad_fork_cgroup_threadgroup_change_end: - cgroup_threadgroup_change_end(current); + cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); @@ -2605,6 +2610,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args args; pid_t *kset_tid = kargs->set_tid; + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != + CLONE_ARGS_SIZE_VER0); + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != + CLONE_ARGS_SIZE_VER1); + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != + CLONE_ARGS_SIZE_VER2); + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) @@ -2631,6 +2644,10 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; + if ((args.flags & CLONE_INTO_CGROUP) && + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), @@ -2641,6 +2658,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, + .cgroup = args.cgroup, }; if (args.set_tid && @@ -2684,7 +2702,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) + if (kargs->flags & + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* diff --git a/kernel/futex.c b/kernel/futex.c index 82dfacb3250e..b59532862bc0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -135,8 +135,7 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers for both - * shared and private futexes in get_futex_key_refs(). + * to futex and the waiters read (see hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -331,17 +330,6 @@ static void compat_exit_robust_list(struct task_struct *curr); static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif -static inline void futex_get_mm(union futex_key *key) -{ - mmgrab(key->private.mm); - /* - * Ensure futex_get_mm() implies a full barrier such that - * get_futex_key() implies a full barrier. This is relied upon - * as smp_mb(); (B), see the ordering comment above. - */ - smp_mb__after_atomic(); -} - /* * Reflects a new waiter being added to the waitqueue. */ @@ -370,6 +358,10 @@ static inline void hb_waiters_dec(struct futex_hash_bucket *hb) static inline int hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. + */ + smp_mb(); return atomic_read(&hb->waiters); #else return 1; @@ -407,69 +399,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - - /* - * On MMU less systems futexes are always "private" as there is no per - * process address space. We need the smp wmb nevertheless - yes, - * arch/blackfin has MMU less SMP ... - */ - if (!IS_ENABLED(CONFIG_MMU)) { - smp_mb(); /* explicit smp_mb(); (B) */ - return; - } - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - smp_mb(); /* explicit smp_mb(); (B) */ - break; - case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies smp_mb(); (B) */ - break; - default: - /* - * Private futexes do not hold reference on an inode or - * mm, therefore the only purpose of calling get_futex_key_refs - * is because we need the barrier for the lockless waiter check. - */ - smp_mb(); /* explicit smp_mb(); (B) */ - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. This is - * a no-op for private futexes, see comment in the get - * counterpart. - */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) { - /* If we're here then we tried to put a key we failed to get */ - WARN_ON_ONCE(1); - return; - } - - if (!IS_ENABLED(CONFIG_MMU)) - return; - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - enum futex_access { FUTEX_READ, FUTEX_WRITE @@ -601,7 +530,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -741,8 +669,6 @@ again: rcu_read_unlock(); } - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - out: put_page(page); return err; @@ -750,7 +676,6 @@ out: static inline void put_futex_key(union futex_key *key) { - drop_futex_key_refs(key); } /** @@ -1740,10 +1665,9 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) oparg = 1 << oparg; } - if (!access_ok(uaddr, sizeof(u32))) - return -EFAULT; - + pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + pagefault_enable(); if (ret) return ret; @@ -1885,7 +1809,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } - get_futex_key_refs(key2); q->key = *key2; } @@ -1907,7 +1830,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - get_futex_key_refs(key); q->key = *key; __unqueue_futex(q); @@ -2018,7 +1940,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; + int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; @@ -2139,7 +2061,6 @@ retry_private: */ if (ret > 0) { WARN_ON(pi_state); - drop_count++; task_count++; /* * If we acquired the lock, then the user space value @@ -2259,7 +2180,6 @@ retry_private: * doing so. */ requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; continue; } else if (ret) { /* @@ -2280,7 +2200,6 @@ retry_private: } } requeue_futex(this, hb1, hb2, &key2); - drop_count++; } /* @@ -2295,15 +2214,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - out_put_keys: put_futex_key(&key2); out_put_key1: @@ -2433,7 +2343,6 @@ retry: ret = 1; } - drop_futex_key_refs(&q->key); return ret; } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index e5eb5ea7ea59..82babf5aa077 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -58,7 +58,7 @@ struct gcov_node { struct dentry *dentry; struct dentry **links; int num_loaded; - char name[0]; + char name[]; }; static const char objtree[] = OBJTREE; @@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) { struct gcov_iterator *iter = data; + (*pos)++; if (gcov_iter_next(iter)) return NULL; - (*pos)++; return iter; } diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 801ee4b0b969..acb83558e5df 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -38,7 +38,7 @@ static struct gcov_info *gcov_info_head; struct gcov_fn_info { unsigned int ident; unsigned int checksum; - unsigned int n_ctrs[0]; + unsigned int n_ctrs[]; }; /** @@ -78,7 +78,7 @@ struct gcov_info { unsigned int n_functions; const struct gcov_fn_info *functions; unsigned int ctr_mask; - struct gcov_ctr_info counts[0]; + struct gcov_ctr_info counts[]; }; /** @@ -352,7 +352,7 @@ struct gcov_iterator { unsigned int count; int num_types; - struct type_info type_info[0]; + struct type_info type_info[]; }; static struct gcov_fn_info *get_func(struct gcov_iterator *iter) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ec37563674d6..908fdf5098c3 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -68,7 +68,7 @@ struct gcov_fn_info { unsigned int ident; unsigned int lineno_checksum; unsigned int cfg_checksum; - struct gcov_ctr_info ctrs[0]; + struct gcov_ctr_info ctrs[]; }; /** diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index f92d9a687372..20d501af4f2e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -43,6 +43,10 @@ config GENERIC_IRQ_MIGRATION config AUTO_IRQ_AFFINITY bool +# Interrupt injection mechanism +config GENERIC_IRQ_INJECTION + bool + # Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool @@ -127,6 +131,7 @@ config SPARSE_IRQ config GENERIC_IRQ_DEBUGFS bool "Expose irq internals in debugfs" depends on DEBUG_FS + select GENERIC_IRQ_INJECTION default n ---help--- diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b3fa2d87d2f3..41e7e37a0928 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -278,7 +278,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } } if (resend) - check_irq_resend(desc); + check_irq_resend(desc, false); return ret; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index a949bd39e343..4f9f844074db 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -190,33 +190,7 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, return -EFAULT; if (!strncmp(buf, "trigger", size)) { - unsigned long flags; - int err; - - /* Try the HW interface first */ - err = irq_set_irqchip_state(irq_desc_get_irq(desc), - IRQCHIP_STATE_PENDING, true); - if (!err) - return count; - - /* - * Otherwise, try to inject via the resend interface, - * which may or may not succeed. - */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { - /* Can't do level nor NMIs, sorry */ - err = -EINVAL; - } else { - desc->istate |= IRQS_PENDING; - check_irq_resend(desc); - err = 0; - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + int err = irq_inject_interrupt(irq_desc_get_irq(desc)); return err ? err : count; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a4ace611f47f..a8e14c80b405 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -145,6 +145,13 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags for_each_action_of_desc(desc, action) { irqreturn_t res; + /* + * If this IRQ would be threaded under force_irqthreads, mark it so. + */ + if (irq_settings_can_thread(desc) && + !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))) + lockdep_hardirq_threaded(); + trace_irq_handler_entry(irq, action); res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c9d8eb7f5c02..7db284b10ac9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc); +int check_irq_resend(struct irq_desc *desc, bool inject); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); @@ -425,6 +425,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return irqd_is_handle_enforce_irqctx(data); +} bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) @@ -451,6 +455,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return false; +} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 98a5f10d1900..1a7723604399 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq) int generic_handle_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *data; if (!desc) return -EINVAL; + + data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + return -EPERM; + generic_handle_irq_desc(desc); return 0; } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7527e5ef6fe5..35b8d97c3a1d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -46,11 +46,11 @@ const struct fwnode_operations irqchip_fwnode_ops; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** - * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for + * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain * @type: Type of irqchip_fwnode. See linux/irqdomain.h - * @name: Optional user provided domain name * @id: Optional user provided id if name != NULL + * @name: Optional user provided domain name * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a poiner to the embedded @@ -1310,6 +1310,11 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } @@ -1347,11 +1352,6 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, return -EINVAL; } - if (!domain->ops->alloc) { - pr_debug("domain->ops->alloc() is NULL\n"); - return -ENOSYS; - } - if (realloc && irq_base >= 0) { virq = irq_base; } else { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7eee98c38f25..453a8a0f4804 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } @@ -1683,34 +1690,6 @@ out_mput: return ret; } -/** - * setup_irq - setup an interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt - * - * Used to statically setup interrupts in the early boot process. - */ -int setup_irq(unsigned int irq, struct irqaction *act) -{ - int retval; - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return -EINVAL; - - retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) - return retval; - - retval = __setup_irq(irq, desc, act); - - if (retval) - irq_chip_pm_put(&desc->irq_data); - - return retval; -} -EXPORT_SYMBOL_GPL(setup_irq); - /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. @@ -1852,22 +1831,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } /** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt - * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(desc, act->dev_id); -} -EXPORT_SYMBOL_GPL(remove_irq); - -/** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 98c04ca5fa43..27634f4022d0 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -47,6 +47,43 @@ static void resend_irqs(unsigned long arg) /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static int irq_sw_resend(struct irq_desc *desc) +{ + unsigned int irq = irq_desc_get_irq(desc); + + /* + * Validate whether this interrupt can be safely injected from + * non interrupt context + */ + if (handle_enforce_irqctx(&desc->irq_data)) + return -EINVAL; + + /* + * If the interrupt is running in the thread context of the parent + * irq we need to be careful, because we cannot trigger it + * directly. + */ + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we retrigger the parent, + * otherwise we do nothing. + */ + if (!desc->parent_irq) + return -EINVAL; + irq = desc->parent_irq; + } + + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); + return 0; +} + +#else +static int irq_sw_resend(struct irq_desc *desc) +{ + return -EINVAL; +} #endif /* @@ -54,49 +91,83 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc) +int check_irq_resend(struct irq_desc *desc, bool inject) { + int err = 0; + /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. Clear the pending bit so suspend/resume does not - * get confused. + * We do not resend level type interrupts. Level type interrupts + * are resent by hardware when they are still active. Clear the + * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; - return; + return -EINVAL; } + if (desc->istate & IRQS_REPLAY) - return; - if (desc->istate & IRQS_PENDING) { - desc->istate &= ~IRQS_PENDING; + return -EBUSY; + + if (!(desc->istate & IRQS_PENDING) && !inject) + return 0; + + desc->istate &= ~IRQS_PENDING; + + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + err = irq_sw_resend(desc); + + /* If the retrigger was successfull, mark it with the REPLAY bit */ + if (!err) desc->istate |= IRQS_REPLAY; + return err; +} - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { -#ifdef CONFIG_HARDIRQS_SW_RESEND - unsigned int irq = irq_desc_get_irq(desc); - - /* - * If the interrupt is running in the thread - * context of the parent irq we need to be - * careful, because we cannot trigger it - * directly. - */ - if (irq_settings_is_nested_thread(desc)) { - /* - * If the parent_irq is valid, we - * retrigger the parent, otherwise we - * do nothing. - */ - if (!desc->parent_irq) - return; - irq = desc->parent_irq; - } - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); - tasklet_schedule(&resend_tasklet); -#endif - } - } +#ifdef CONFIG_GENERIC_IRQ_INJECTION +/** + * irq_inject_interrupt - Inject an interrupt for testing/error injection + * @irq: The interrupt number + * + * This function must only be used for debug and testing purposes! + * + * Especially on x86 this can cause a premature completion of an interrupt + * affinity change causing the interrupt line to become stale. Very + * unlikely, but possible. + * + * The injection can fail for various reasons: + * - Interrupt is not activated + * - Interrupt is NMI type or currently replaying + * - Interrupt is level type + * - Interrupt does not support hardware retrigger and software resend is + * either not enabled or not possible for the interrupt. + */ +int irq_inject_interrupt(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int err; + + /* Try the state injection hardware interface first */ + if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) + return 0; + + /* That failed, try via the resend mechanism */ + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) + err = -EINVAL; + else + err = check_irq_resend(desc, true); + + irq_put_desc_busunlock(desc, flags); + return err; } +EXPORT_SYMBOL_GPL(irq_inject_interrupt); +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 828cc30774bc..48b5d1b6af4d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -153,7 +153,9 @@ static void irq_work_run_list(struct llist_head *list) */ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + lockdep_irq_work_enter(work); work->func(work); + lockdep_irq_work_exit(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9b3f660dee7..16c8c605f4b0 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -175,7 +175,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), @@ -194,7 +193,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return module_kallsyms_on_each_symbol(fn, data); } -EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, diff --git a/kernel/kcmp.c b/kernel/kcmp.c index a0e3d7a0e8b8..b3ff9288c6cc 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. */ - ret = kcmp_lock(&task1->signal->cred_guard_mutex, - &task2->signal->cred_guard_mutex); + ret = kcmp_lock(&task1->signal->exec_update_mutex, + &task2->signal->exec_update_mutex); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->cred_guard_mutex, - &task2->signal->cred_guard_mutex); + kcmp_unlock(&task1->signal->exec_update_mutex, + &task2->signal->exec_update_mutex); err: put_task_struct(task1); put_task_struct(task2); diff --git a/kernel/kmod.c b/kernel/kmod.c index bc6addd9152b..37c3c4b97b8e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -35,7 +35,7 @@ * (u64) THREAD_SIZE * 8UL); * * If you need less than 50 threads would mean we're dealing with systems - * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * smaller than 3200 pages. This assumes you are capable of having ~13M memory, * and this would only be an be an upper limit, after which the OOM killer * would take effect. Systems like these are very unlikely if modules are * enabled. @@ -120,7 +120,7 @@ out: * invoke it. * * If module auto-loading support is disabled then this function - * becomes a no-operation. + * simply returns -ENOENT. */ int __request_module(bool wait, const char *fmt, ...) { @@ -137,7 +137,7 @@ int __request_module(bool wait, const char *fmt, ...) WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0]) - return 0; + return -ENOENT; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); diff --git a/kernel/kthread.c b/kernel/kthread.c index b262f47046ca..bfbfa481be3a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -199,8 +199,15 @@ static void __kthread_parkme(struct kthread *self) if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + /* + * Thread is going to call schedule(), do not preempt it, + * or the caller of kthread_park() may spend more time in + * wait_task_inactive(). + */ + preempt_disable(); complete(&self->parked); - schedule(); + schedule_preempt_disabled(); + preempt_enable(); } __set_current_state(TASK_RUNNING); } @@ -245,8 +252,14 @@ static int kthread(void *_create) /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; + /* + * Thread is going to call schedule(), do not preempt it, + * or the creator may spend more time in wait_task_inactive(). + */ + preempt_disable(); complete(done); - schedule(); + schedule_preempt_disabled(); + preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32406ef0d6a2..ac10db66cc63 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,12 +84,39 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *__owner; + +static inline void lockdep_lock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + arch_spin_lock(&__lock); + __owner = current; + current->lockdep_recursion++; +} + +static inline void lockdep_unlock(void) +{ + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) + return; + + current->lockdep_recursion--; + __owner = NULL; + arch_spin_unlock(&__lock); +} + +static inline bool lockdep_assert_locked(void) +{ + return DEBUG_LOCKS_WARN_ON(__owner != current); +} + static struct task_struct *lockdep_selftest_task_struct; + static int graph_lock(void) { - arch_spin_lock(&lockdep_lock); + lockdep_lock(); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -97,27 +124,15 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return 0; } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; return 1; } -static inline int graph_unlock(void) +static inline void graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; + lockdep_unlock(); } /* @@ -128,7 +143,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return ret; } @@ -147,6 +162,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); #define KEYHASH_SIZE (1UL << KEYHASH_BITS) static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; +unsigned long nr_zapped_classes; #ifndef CONFIG_DEBUG_LOCKDEP static #endif @@ -377,18 +393,31 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +/* + * Split the recrursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + void lockdep_off(void) { - current->lockdep_recursion++; + current->lockdep_recursion += LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { - current->lockdep_recursion--; + current->lockdep_recursion -= LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_on); +static inline void lockdep_recursion_finish(void) +{ + if (WARN_ON_ONCE(--current->lockdep_recursion)) + current->lockdep_recursion = 0; +} + void lockdep_set_selftest_task(struct task_struct *task) { lockdep_selftest_task_struct = task; @@ -575,6 +604,7 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -653,7 +683,9 @@ static void print_lock_name(struct lock_class *class) printk(KERN_CONT " ("); __print_lock_name(class); - printk(KERN_CONT "){%s}", usage); + printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + class->wait_type_outer ?: class->wait_type_inner, + class->wait_type_inner); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -787,6 +819,7 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } +/* used from NMI context -- must be lockless */ static inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { @@ -1070,13 +1103,15 @@ static inline void check_data_structures(void) { } #endif /* CONFIG_DEBUG_LOCKDEP */ +static void init_chain_block_buckets(void); + /* * Initialize the lock_classes[] array elements, the free_lock_classes list * and also the delayed_free structure. */ static void init_data_structures_once(void) { - static bool ds_initialized, rcu_head_initialized; + static bool __read_mostly ds_initialized, rcu_head_initialized; int i; if (likely(rcu_head_initialized)) @@ -1100,6 +1135,7 @@ static void init_data_structures_once(void) INIT_LIST_HEAD(&lock_classes[i].locks_after); INIT_LIST_HEAD(&lock_classes[i].locks_before); } + init_chain_block_buckets(); } static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) @@ -1230,6 +1266,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) WARN_ON_ONCE(!list_empty(&class->locks_before)); WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); + class->wait_type_inner = lock->wait_type_inner; + class->wait_type_outer = lock->wait_type_outer; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1469,6 +1507,8 @@ static int __bfs(struct lock_list *source_entry, struct circular_queue *cq = &lock_cq; int ret = 1; + lockdep_assert_locked(); + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; @@ -1491,8 +1531,6 @@ static int __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); - list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; @@ -1719,9 +1757,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -1746,9 +1784,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -2298,18 +2336,6 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, return 0; } -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - #else static inline int check_irq_usage(struct task_struct *curr, @@ -2317,13 +2343,27 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } +#endif /* CONFIG_TRACE_IRQFLAGS */ -static inline void inc_chains(void) +static void inc_chains(int irq_context) { - nr_process_chains++; + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains++; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains++; + else + nr_process_chains++; } -#endif /* CONFIG_TRACE_IRQFLAGS */ +static void dec_chains(int irq_context) +{ + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains--; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains--; + else + nr_process_chains--; +} static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2622,8 +2662,235 @@ out_bug: struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); -int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +unsigned long nr_zapped_lock_chains; +unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */ +unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */ +unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */ + +/* + * The first 2 chain_hlocks entries in the chain block in the bucket + * list contains the following meta data: + * + * entry[0]: + * Bit 15 - always set to 1 (it is not a class index) + * Bits 0-14 - upper 15 bits of the next block index + * entry[1] - lower 16 bits of next block index + * + * A next block index of all 1 bits means it is the end of the list. + * + * On the unsized bucket (bucket-0), the 3rd and 4th entries contain + * the chain block size: + * + * entry[2] - upper 16 bits of the chain block size + * entry[3] - lower 16 bits of the chain block size + */ +#define MAX_CHAIN_BUCKETS 16 +#define CHAIN_BLK_FLAG (1U << 15) +#define CHAIN_BLK_LIST_END 0xFFFFU + +static int chain_block_buckets[MAX_CHAIN_BUCKETS]; + +static inline int size_to_bucket(int size) +{ + if (size > MAX_CHAIN_BUCKETS) + return 0; + + return size - 1; +} + +/* + * Iterate all the chain blocks in a bucket. + */ +#define for_each_chain_block(bucket, prev, curr) \ + for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \ + (curr) >= 0; \ + (prev) = (curr), (curr) = chain_block_next(curr)) + +/* + * next block or -1 + */ +static inline int chain_block_next(int offset) +{ + int next = chain_hlocks[offset]; + + WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG)); + + if (next == CHAIN_BLK_LIST_END) + return -1; + + next &= ~CHAIN_BLK_FLAG; + next <<= 16; + next |= chain_hlocks[offset + 1]; + + return next; +} + +/* + * bucket-0 only + */ +static inline int chain_block_size(int offset) +{ + return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3]; +} + +static inline void init_chain_block(int offset, int next, int bucket, int size) +{ + chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG; + chain_hlocks[offset + 1] = (u16)next; + + if (size && !bucket) { + chain_hlocks[offset + 2] = size >> 16; + chain_hlocks[offset + 3] = (u16)size; + } +} + +static inline void add_chain_block(int offset, int size) +{ + int bucket = size_to_bucket(size); + int next = chain_block_buckets[bucket]; + int prev, curr; + + if (unlikely(size < 2)) { + /* + * We can't store single entries on the freelist. Leak them. + * + * One possible way out would be to uniquely mark them, other + * than with CHAIN_BLK_FLAG, such that we can recover them when + * the block before it is re-added. + */ + if (size) + nr_lost_chain_hlocks++; + return; + } + + nr_free_chain_hlocks += size; + if (!bucket) { + nr_large_chain_blocks++; + + /* + * Variable sized, sort large to small. + */ + for_each_chain_block(0, prev, curr) { + if (size >= chain_block_size(curr)) + break; + } + init_chain_block(offset, curr, 0, size); + if (prev < 0) + chain_block_buckets[0] = offset; + else + init_chain_block(prev, offset, 0, 0); + return; + } + /* + * Fixed size, add to head. + */ + init_chain_block(offset, next, bucket, size); + chain_block_buckets[bucket] = offset; +} + +/* + * Only the first block in the list can be deleted. + * + * For the variable size bucket[0], the first block (the largest one) is + * returned, broken up and put back into the pool. So if a chain block of + * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be + * queued up after the primordial chain block and never be used until the + * hlock entries in the primordial chain block is almost used up. That + * causes fragmentation and reduce allocation efficiency. That can be + * monitored by looking at the "large chain blocks" number in lockdep_stats. + */ +static inline void del_chain_block(int bucket, int size, int next) +{ + nr_free_chain_hlocks -= size; + chain_block_buckets[bucket] = next; + + if (!bucket) + nr_large_chain_blocks--; +} + +static void init_chain_block_buckets(void) +{ + int i; + + for (i = 0; i < MAX_CHAIN_BUCKETS; i++) + chain_block_buckets[i] = -1; + + add_chain_block(0, ARRAY_SIZE(chain_hlocks)); +} + +/* + * Return offset of a chain block of the right size or -1 if not found. + * + * Fairly simple worst-fit allocator with the addition of a number of size + * specific free lists. + */ +static int alloc_chain_hlocks(int req) +{ + int bucket, curr, size; + + /* + * We rely on the MSB to act as an escape bit to denote freelist + * pointers. Make sure this bit isn't set in 'normal' class_idx usage. + */ + BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG); + + init_data_structures_once(); + + if (nr_free_chain_hlocks < req) + return -1; + + /* + * We require a minimum of 2 (u16) entries to encode a freelist + * 'pointer'. + */ + req = max(req, 2); + bucket = size_to_bucket(req); + curr = chain_block_buckets[bucket]; + + if (bucket) { + if (curr >= 0) { + del_chain_block(bucket, req, chain_block_next(curr)); + return curr; + } + /* Try bucket 0 */ + curr = chain_block_buckets[0]; + } + + /* + * The variable sized freelist is sorted by size; the first entry is + * the largest. Use it if it fits. + */ + if (curr >= 0) { + size = chain_block_size(curr); + if (likely(size >= req)) { + del_chain_block(0, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + } + + /* + * Last resort, split a block in a larger sized bucket. + */ + for (size = MAX_CHAIN_BUCKETS; size > req; size--) { + bucket = size_to_bucket(size); + curr = chain_block_buckets[bucket]; + if (curr < 0) + continue; + + del_chain_block(bucket, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + + return -1; +} + +static inline void free_chain_hlocks(int base, int size) +{ + add_chain_block(base, max(size, 2)); +} struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { @@ -2803,7 +3070,7 @@ static inline int add_chain_cache(struct task_struct *curr, * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (lockdep_assert_locked()) return 0; chain = alloc_lock_chain(); @@ -2824,15 +3091,8 @@ static inline int add_chain_cache(struct task_struct *curr, BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - nr_chain_hlocks += chain->depth; - } else { + j = alloc_chain_hlocks(chain->depth); + if (j < 0) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2841,9 +3101,16 @@ static inline int add_chain_cache(struct task_struct *curr, return 0; } + chain->base = j; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class_idx; + + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); - inc_chains(); + inc_chains(chain->irq_context); return 1; } @@ -2987,6 +3254,8 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void init_chain_block_buckets(void) { } #endif /* CONFIG_PROVE_LOCKING */ /* @@ -3081,10 +3350,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); + lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + lockdep_hardirqs_enabled(curr), + lockdep_softirqs_enabled(curr)); print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); @@ -3429,9 +3698,9 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; - current->lockdep_recursion = 1; + current->lockdep_recursion++; __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } NOKPROBE_SYMBOL(lockdep_hardirqs_on); @@ -3468,7 +3737,7 @@ NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: */ -void trace_softirqs_on(unsigned long ip) +void lockdep_softirqs_on(unsigned long ip) { struct task_struct *curr = current; @@ -3487,7 +3756,7 @@ void trace_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion = 1; + current->lockdep_recursion++; /* * We'll do an OFF -> ON transition: */ @@ -3502,13 +3771,13 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } /* * Softirqs were disabled: */ -void trace_softirqs_off(unsigned long ip) +void lockdep_softirqs_off(unsigned long ip) { struct task_struct *curr = current; @@ -3596,7 +3865,8 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return 2 * !!task->hardirq_context + !!task->softirq_context; + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } static int separate_irq_context(struct task_struct *curr, @@ -3682,6 +3952,124 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static inline short task_wait_context(struct task_struct *curr) +{ + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + return LD_WAIT_CONFIG; + + return LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + return LD_WAIT_CONFIG; + } + + return LD_WAIT_MAX; +} + +static int +print_lock_invalid_wait_context(struct task_struct *curr, + struct held_lock *hlock) +{ + short curr_inner; + + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("[ BUG: Invalid wait context ]\n"); + print_kernel_ident(); + pr_warn("-----------------------------\n"); + + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + pr_warn("other info that might help us debug this:\n"); + + curr_inner = task_wait_context(curr); + pr_warn("context-{%d:%d}\n", curr_inner, curr_inner); + + lockdep_print_held_locks(curr); + + pr_warn("stack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Verify the wait_type context. + * + * This check validates we takes locks in the right wait-type order; that is it + * ensures that we do not take mutexes inside spinlocks and do not attempt to + * acquire spinlocks inside raw_spinlocks and the sort. + * + * The entire thing is slightly more complex because of RCU, RCU is a lock that + * can be taken from (pretty much) any context but also has constraints. + * However when taken in a stricter environment the RCU lock does not loosen + * the constraints. + * + * Therefore we must look for the strictest environment in the lock stack and + * compare that to the lock we're trying to acquire. + */ +static int check_wait_context(struct task_struct *curr, struct held_lock *next) +{ + short next_inner = hlock_class(next)->wait_type_inner; + short next_outer = hlock_class(next)->wait_type_outer; + short curr_inner; + int depth; + + if (!curr->lockdep_depth || !next_inner || next->trylock) + return 0; + + if (!next_outer) + next_outer = next_inner; + + /* + * Find start of current irq_context.. + */ + for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) { + struct held_lock *prev = curr->held_locks + depth; + if (prev->irq_context != next->irq_context) + break; + } + depth++; + + curr_inner = task_wait_context(curr); + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; + short prev_inner = hlock_class(prev)->wait_type_inner; + + if (prev_inner) { + /* + * We can have a bigger inner than a previous one + * when outer is smaller than inner, as with RCU. + * + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); + } + } + + if (next_outer > curr_inner) + return print_lock_invalid_wait_context(curr, next); + + return 0; +} + #else /* CONFIG_PROVE_LOCKING */ static inline int @@ -3701,13 +4089,20 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } +static inline int check_wait_context(struct task_struct *curr, + struct held_lock *next) +{ + return 0; +} + #endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) +void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, + short inner, short outer) { int i; @@ -3728,6 +4123,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + lock->wait_type_outer = outer; + lock->wait_type_inner = inner; + /* * No key, no joy, we need to hash something. */ @@ -3755,13 +4153,13 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; register_lock_class(lock, subclass, 1); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map); +EXPORT_SYMBOL_GPL(lockdep_init_map_waits); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3862,7 +4260,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { + if (depth) { /* we're holding locks */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -3904,6 +4302,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; + if (check_wait_context(curr, hlock)) + return 0; + /* Initialize the lock usage bit */ if (!mark_usage(curr, hlock, check)) return 0; @@ -4139,7 +4540,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map(lock, name, key, 0); + lockdep_init_map_waits(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -4437,11 +4840,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); @@ -4454,15 +4857,45 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_downgrade); +/* NMI context !!! */ +static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class = look_up_lock_class(lock, subclass); + + /* if it doesn't have a class (yet), it certainly hasn't been used yet */ + if (!class) + return; + + if (!(class->usage_mask & LOCK_USED)) + return; + + hlock->class_idx = class - lock_classes; + + print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES); +#endif +} + +static bool lockdep_nmi(void) +{ + if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + return false; + + if (!in_nmi()) + return false; + + return true; +} + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -4473,17 +4906,34 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(current->lockdep_recursion)) { + /* XXX allow trylock from NMI ?!? */ + if (lockdep_nmi() && !trylock) { + struct held_lock hlock; + + hlock.acquire_ip = ip; + hlock.instance = lock; + hlock.nest_lock = nest_lock; + hlock.irq_context = 2; // XXX + hlock.trylock = trylock; + hlock.read = read; + hlock.check = check; + hlock.hardirqs_off = true; + hlock.references = 0; + + verify_lock_unused(lock, &hlock, subclass); + } return; + } raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); @@ -4497,11 +4947,11 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); @@ -4517,9 +4967,9 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; ret = __lock_is_held(lock, read); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return ret; @@ -4538,9 +4988,9 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; cookie = __lock_pin_lock(lock); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return cookie; @@ -4557,9 +5007,9 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_repin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_repin_lock); @@ -4574,9 +5024,9 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_unpin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_unpin_lock); @@ -4712,10 +5162,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_contended(lock, ip); __lock_contended(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); @@ -4732,9 +5182,9 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_acquired(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); @@ -4768,57 +5218,33 @@ static void remove_class_from_lock_chain(struct pending_free *pf, struct lock_class *class) { #ifdef CONFIG_PROVE_LOCKING - struct lock_chain *new_chain; - u64 chain_key; int i; for (i = chain->base; i < chain->base + chain->depth; i++) { if (chain_hlocks[i] != class - lock_classes) continue; - /* The code below leaks one chain_hlock[] entry. */ - if (--chain->depth > 0) { - memmove(&chain_hlocks[i], &chain_hlocks[i + 1], - (chain->base + chain->depth - i) * - sizeof(chain_hlocks[0])); - } /* * Each lock class occurs at most once in a lock chain so once * we found a match we can break out of this loop. */ - goto recalc; + goto free_lock_chain; } /* Since the chain has not been modified, return. */ return; -recalc: - chain_key = INITIAL_CHAIN_KEY; - for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); - if (chain->depth && chain->chain_key == chain_key) - return; +free_lock_chain: + free_chain_hlocks(chain->base, chain->depth); /* Overwrite the chain key for concurrent RCU readers. */ - WRITE_ONCE(chain->chain_key, chain_key); + WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY); + dec_chains(chain->irq_context); + /* * Note: calling hlist_del_rcu() from inside a * hlist_for_each_entry_rcu() loop is safe. */ hlist_del_rcu(&chain->entry); __set_bit(chain - lock_chains, pf->lock_chains_being_freed); - if (chain->depth == 0) - return; - /* - * If the modified lock chain matches an existing lock chain, drop - * the modified lock chain. - */ - if (lookup_chain_cache(chain_key)) - return; - new_chain = alloc_lock_chain(); - if (WARN_ON_ONCE(!new_chain)) { - debug_locks_off(); - return; - } - *new_chain = *chain; - hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); + nr_zapped_lock_chains++; #endif } @@ -4874,6 +5300,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) } remove_class_from_lock_chains(pf, class); + nr_zapped_classes++; } static void reinit_class(struct lock_class *class) @@ -4958,8 +5385,7 @@ static void free_zapped_rcu(struct rcu_head *ch) return; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); /* closed head */ pf = delayed_free.pf + (delayed_free.index ^ 1); @@ -4971,8 +5397,7 @@ static void free_zapped_rcu(struct rcu_head *ch) */ call_rcu_zapped(delayed_free.pf + delayed_free.index); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5017,13 +5442,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); call_rcu_zapped(pf); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); /* @@ -5045,10 +5468,10 @@ static void lockdep_free_key_range_imm(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_free_key_range(pf, start, size); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5144,10 +5567,10 @@ static void lockdep_reset_lock_imm(struct lockdep_map *lock) unsigned long flags; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_reset_lock(pf, lock); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 18d85aebbb57..baca699b94e9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -106,6 +106,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define STACK_TRACE_HASH_SIZE 16384 #endif +/* + * Bit definitions for lock_chain.irq_context + */ +#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0) +#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1) + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -124,17 +130,21 @@ extern const char *__get_key_name(const struct lockdep_subclass_key *key, struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; +extern unsigned long nr_zapped_classes; +extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); -extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; +extern unsigned int nr_free_chain_hlocks; +extern unsigned int nr_lost_chain_hlocks; +extern unsigned int nr_large_chain_blocks; +extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 231684cfc5ae..5525cd3ba0c8 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -128,15 +128,22 @@ static int lc_show(struct seq_file *m, void *v) struct lock_chain *chain = v; struct lock_class *class; int i; + static const char * const irq_strs[] = { + [0] = "0", + [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT| + LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq", + }; if (v == SEQ_START_TOKEN) { - if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + if (!nr_free_chain_hlocks) seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } - seq_printf(m, "irq_context: %d\n", chain->irq_context); + seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]); for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); @@ -271,8 +278,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", lock_chain_count(), MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n", + MAX_LOCKDEP_CHAIN_HLOCKS - + (nr_free_chain_hlocks + nr_lost_chain_hlocks), + MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks lost: %11u\n", + nr_lost_chain_hlocks); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -336,6 +347,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " debug_locks: %11u\n", debug_locks); + /* + * Zappped classes and lockdep data buffers reuse statistics. + */ + seq_puts(m, "\n"); + seq_printf(m, " zapped classes: %11lu\n", + nr_zapped_classes); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " zapped lock chains: %11lu\n", + nr_zapped_lock_chains); + seq_printf(m, " large chain blocks: %11u\n", + nr_large_chain_blocks); +#endif return 0; } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 771d4ca96dda..a7276aaf2abc 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -85,7 +85,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif lock->magic = lock; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 364d38a0c444..8bbafe3e5203 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,27 +1,29 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> -#include <linux/rwsem.h> #include <linux/percpu.h> +#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/errno.h> -#include "rwsem.h" - int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, - const char *name, struct lock_class_key *rwsem_key) + const char *name, struct lock_class_key *key) { sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss); - __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); - sem->readers_block = 0; + init_waitqueue_head(&sem->waiters); + atomic_set(&sem->block, 0); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); @@ -41,73 +43,140 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { + __this_cpu_inc(*sem->read_count); + /* * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * - * If the reader misses the writer's assignment of readers_block, then - * the writer is guaranteed to see the reader's increment. + * If the reader misses the writer's assignment of sem->block, then the + * writer is guaranteed to see the reader's increment. * * Conversely, any readers that increment their sem->read_count after - * the writer looks are guaranteed to see the readers_block value, - * which in turn means that they are guaranteed to immediately - * decrement their sem->read_count, so that it doesn't matter that the - * writer missed them. + * the writer looks are guaranteed to see the sem->block value, which + * in turn means that they are guaranteed to immediately decrement + * their sem->read_count, so that it doesn't matter that the writer + * missed them. */ smp_mb(); /* A matches D */ /* - * If !readers_block the critical section starts here, matched by the + * If !sem->block the critical section starts here, matched by the * release in percpu_up_write(). */ - if (likely(!smp_load_acquire(&sem->readers_block))) + if (likely(!atomic_read_acquire(&sem->block))) + return true; + + __this_cpu_dec(*sem->read_count); + + /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + + return false; +} + +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) +{ + if (atomic_read(&sem->block)) + return false; + + return atomic_xchg(&sem->block, 1) == 0; +} + +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) +{ + if (reader) { + bool ret; + + preempt_disable(); + ret = __percpu_down_read_trylock(sem); + preempt_enable(); + + return ret; + } + return __percpu_down_write_trylock(sem); +} + +/* + * The return value of wait_queue_entry::func means: + * + * <0 - error, wakeup is terminated and the error is returned + * 0 - no wakeup, a next waiter is tried + * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. + * + * We use EXCLUSIVE for both readers and writers to preserve FIFO order, + * and play games with the return value to allow waking multiple readers. + * + * Specifically, we wake readers until we've woken a single writer, or until a + * trylock fails. + */ +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int wake_flags, + void *key) +{ + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; + struct percpu_rw_semaphore *sem = key; + struct task_struct *p; + + /* concurrent against percpu_down_write(), can get stolen */ + if (!__percpu_rwsem_trylock(sem, reader)) return 1; - /* - * Per the above comment; we still have preemption disabled and - * will thus decrement on the same CPU as we incremented. - */ - __percpu_up_read(sem); + p = get_task_struct(wq_entry->private); + list_del_init(&wq_entry->entry); + smp_store_release(&wq_entry->private, NULL); - if (try) - return 0; + wake_up_process(p); + put_task_struct(p); - /* - * We either call schedule() in the wait, or we'll fall through - * and reschedule on the preempt_enable() in percpu_down_read(). - */ - preempt_enable_no_resched(); + return !reader; /* wake (readers until) 1 writer */ +} + +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +{ + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); + bool wait; + spin_lock_irq(&sem->waiters.lock); /* - * Avoid lockdep for the down/up_read() we already have them. + * Serialize against the wakeup in percpu_up_write(), if we fail + * the trylock, the wakeup must see us on the list. */ - __down_read(&sem->rw_sem); - this_cpu_inc(*sem->read_count); - __up_read(&sem->rw_sem); + wait = !__percpu_rwsem_trylock(sem, reader); + if (wait) { + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); + } + spin_unlock_irq(&sem->waiters.lock); - preempt_disable(); - return 1; + while (wait) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!smp_load_acquire(&wq_entry.private)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL_GPL(__percpu_down_read); -void __percpu_up_read(struct percpu_rw_semaphore *sem) +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to aggregate - * zero, as that is the only time it matters) they will also see our - * critical section. - */ - __this_cpu_dec(*sem->read_count); + if (__percpu_down_read_trylock(sem)) + return true; - /* Prod writer to recheck readers_active */ - rcuwait_wake_up(&sem->writer); + if (try) + return false; + + preempt_enable(); + percpu_rwsem_wait(sem, /* .reader = */ true); + preempt_disable(); + + return true; } -EXPORT_SYMBOL_GPL(__percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ @@ -124,6 +193,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. + * + * Assumes sem->block is set. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { @@ -142,32 +213,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) void percpu_down_write(struct percpu_rw_semaphore *sem) { + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); - down_write(&sem->rw_sem); - /* - * Notify new readers to block; up until now, and thus throughout the - * longish rcu_sync_enter() above, new readers could still come in. + * Try set sem->block; this provides writer-writer exclusion. + * Having sem->block set makes new readers block. */ - WRITE_ONCE(sem->readers_block, 1); + if (!__percpu_down_write_trylock(sem)) + percpu_rwsem_wait(sem, /* .reader = */ false); - smp_mb(); /* D matches A */ + /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ /* - * If they don't see our writer of readers_block, then we are - * guaranteed to see their sem->read_count increment, and therefore - * will wait for them. + * If they don't see our store of sem->block, then we are guaranteed to + * see their sem->read_count increment, and therefore will wait for + * them. */ - /* Wait for all now active readers to complete. */ - rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *sem) { + rwsem_release(&sem->dep_map, _RET_IP_); + /* * Signal the writer is done, no fast path yet. * @@ -178,12 +253,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ - smp_store_release(&sem->readers_block, 0); + atomic_set_release(&sem->block, 0); /* - * Release the write lock, this will allow readers back in the game. + * Prod any pending reader/writer to make progress. */ - up_write(&sem->rw_sem); + __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); /* * Once this completes (at least one RCU-sched grace period hence) the diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 0d9b6be9ecc8..f11b9bd3431d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,7 +28,6 @@ #include <linux/rwsem.h> #include <linux/atomic.h> -#include "rwsem.h" #include "lock_events.h" /* @@ -329,7 +328,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif #ifdef CONFIG_DEBUG_RWSEMS sem->magic = sem; @@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long flags; bool ret = true; - BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); - if (need_resched()) { lockevent_inc(rwsem_opt_fail); return false; @@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); @@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_semaphore *sem) { long tmp; diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 2534ce49f648..e69de29bb2d1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __INTERNAL_RWSEM_H -#define __INTERNAL_RWSEM_H -#include <linux/rwsem.h> - -extern void __down_read(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); - -#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 472dd462a40c..b9d93087ee66 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -14,14 +14,14 @@ #include <linux/export.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, short inner) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner); #endif lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; @@ -39,7 +39,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); #endif lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; diff --git a/kernel/module.c b/kernel/module.c index 33569a01d6e1..646f1e2330d2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1515,7 +1515,7 @@ struct module_sect_attr { struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; - struct module_sect_attr attrs[0]; + struct module_sect_attr attrs[]; }; static ssize_t module_sect_show(struct module_attribute *mattr, @@ -1608,7 +1608,7 @@ static void remove_sect_attrs(struct module *mod) struct module_notes_attrs { struct kobject *dir; unsigned int notes; - struct bin_attribute attrs[0]; + struct bin_attribute attrs[]; }; static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, @@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file) } static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = modules_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/kernel/padata.c b/kernel/padata.c index 72777c10bb9c..a6afa12fb75e 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -512,7 +512,7 @@ static int padata_replace_one(struct padata_shell *ps) static int padata_replace(struct padata_instance *pinst) { struct padata_shell *ps; - int err; + int err = 0; pinst->flags |= PADATA_RESET; @@ -1038,12 +1038,13 @@ EXPORT_SYMBOL(padata_alloc_shell); */ void padata_free_shell(struct padata_shell *ps) { - struct padata_instance *pinst = ps->pinst; + if (!ps) + return; - mutex_lock(&pinst->lock); + mutex_lock(&ps->pinst->lock); list_del(&ps->list); padata_free_pd(rcu_dereference_protected(ps->pd, 1)); - mutex_unlock(&pinst->lock); + mutex_unlock(&ps->pinst->lock); kfree(ps); } diff --git a/kernel/pid.c b/kernel/pid.c index 647b4bb457b5..c835b844aca7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -144,9 +144,6 @@ void free_pid(struct pid *pid) /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; - /* fall through */ - case 0: - schedule_work(&ns->proc_work); break; } @@ -257,17 +254,14 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, */ retval = -ENOMEM; - if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) - goto out_free; - } - get_pid_ns(ns); refcount_set(&pid->count, 1); + spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); + INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); @@ -594,7 +588,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->cred_guard_mutex); + ret = mutex_lock_killable(&task->signal->exec_update_mutex); if (ret) return ERR_PTR(ret); @@ -603,7 +597,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return file ?: ERR_PTR(-EBADF); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d40017e79ebe..01f8ba32cc0c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return READ_ONCE(*pkc); } -static void proc_cleanup_work(struct work_struct *work) -{ - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); - pid_ns_release_proc(ns); -} - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } while (rc != -ECHILD); /* - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not - * really care, we could reparent them to the global init. We could - * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), - * pid_ns can not go away until proc_kill_sb() drops the reference. + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE + * process whose parents processes are outside of the pid + * namespace. Such processes are created with setns()+fork(). + * + * If those EXIT_ZOMBIE processes are not reaped by their + * parents before their parents exit, they will be reparented + * to pid_ns->child_reaper. Thus pidns->child_reaper needs to + * stay valid until they all go away. + * + * The code relies on the the pid_ns->child_reaper ignoring + * SIGCHILD to cause those EXIT_ZOMBIE processes to be + * autoreaped if reparented. * - * But this ns can also have other tasks injected by setns()+fork(). - * Again, ignoring the user visible semantics we do not really need - * to wait until they are all reaped, but they can be reparented to - * us and thus we need to ensure that pid->child_reaper stays valid - * until they all go away. See free_pid()->wake_up_process(). + * Semantically it is also desirable to wait for EXIT_ZOMBIE + * processes before allowing the child_reaper to be reaped, as + * that gives the invariant that when the init process of a + * pid namespace is reaped all of the processes in the pid + * namespace are gone. * - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped - * if reparented. + * Once all of the other tasks are gone from the pid_namespace + * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 7cbfbeacd68a..c208566c844b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,9 +80,6 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.rst>. -config ARCH_SAVE_PAGE_KEYS - bool - config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6dbeedb7354c..86aba8706b16 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -678,7 +678,7 @@ static int load_image_and_restore(void) error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); + error = hibernation_restore(flags & SF_PLATFORM_MODE); pr_err("Failed to load image, recovering.\n"); swsusp_free(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 69b7a8aeca3b..40f86ec4ab30 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -535,6 +535,13 @@ static ssize_t pm_debug_messages_store(struct kobject *kobj, power_attr(pm_debug_messages); +static int __init pm_debug_messages_setup(char *str) +{ + pm_debug_messages_on = true; + return 1; +} +__setup("pm_debug_messages", pm_debug_messages_setup); + /** * __pm_pr_dbg - Print a suspend debug message to the kernel log. * @defer: Whether or not to use printk_deferred() to print the message. diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 83edf8698118..db0bed2cae26 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,31 +1,21 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. It provides infrastructure for registration of: + * Power Management Quality of Service (PM QoS) support base. * - * Dependents on a QoS value : register requests - * Watchers of QoS value : get notified when target QoS value changes + * Copyright (C) 2020 Intel Corporation * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. + * Authors: + * Mark Gross <mgross@linux.intel.com> + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) + * Provided here is an interface for specifying PM QoS dependencies. It allows + * entities depending on QoS constraints to register their requests which are + * aggregated as appropriate to produce effective constraints (target values) + * that can be monitored by entities needing to respect them, either by polling + * or through a built-in notification mechanism. * - * There are lists of pm_qos_objects each one wrapping requests, notifiers - * - * User mode requests on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * request is removed and a new qos target is computed. This way when the - * request that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. - * - * Mark Gross <mgross@linux.intel.com> + * In addition to the basic functionality, more specific interfaces for managing + * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ @@ -54,56 +44,19 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_object { - struct pm_qos_constraints *constraints; - struct miscdevice pm_qos_power_miscdev; - char *name; -}; - static DEFINE_SPINLOCK(pm_qos_lock); -static struct pm_qos_object null_pm_qos; - -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_constraints cpu_dma_constraints = { - .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), - .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &cpu_dma_lat_notifier, -}; -static struct pm_qos_object cpu_dma_pm_qos = { - .constraints = &cpu_dma_constraints, - .name = "cpu_dma_latency", -}; - -static struct pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, -}; - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .read = pm_qos_power_read, - .open = pm_qos_power_open, - .release = pm_qos_power_release, - .llseek = noop_llseek, -}; - -/* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_constraints *c) +/** + * pm_qos_read_value - Return the current effective constraint value. + * @c: List of PM QoS constraint requests. + */ +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - struct plist_node *node; - int total_value = 0; + return READ_ONCE(c->target_value); +} +static int pm_qos_get_value(struct pm_qos_constraints *c) +{ if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -114,111 +67,42 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; - case PM_QOS_SUM: - plist_for_each(node, &c->list) - total_value += node->prio; - - return total_value; - default: - /* runtime check for not using enum */ - BUG(); + WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } -s32 pm_qos_read_value(struct pm_qos_constraints *c) -{ - return c->target_value; -} - -static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - c->target_value = value; + WRITE_ONCE(c->target_value, value); } -static int pm_qos_debug_show(struct seq_file *s, void *unused) -{ - struct pm_qos_object *qos = (struct pm_qos_object *)s->private; - struct pm_qos_constraints *c; - struct pm_qos_request *req; - char *type; - unsigned long flags; - int tot_reqs = 0; - int active_reqs = 0; - - if (IS_ERR_OR_NULL(qos)) { - pr_err("%s: bad qos param!\n", __func__); - return -EINVAL; - } - c = qos->constraints; - if (IS_ERR_OR_NULL(c)) { - pr_err("%s: Bad constraints on qos?\n", __func__); - return -EINVAL; - } - - /* Lock to ensure we have a snapshot */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (plist_head_empty(&c->list)) { - seq_puts(s, "Empty!\n"); - goto out; - } - - switch (c->type) { - case PM_QOS_MIN: - type = "Minimum"; - break; - case PM_QOS_MAX: - type = "Maximum"; - break; - case PM_QOS_SUM: - type = "Sum"; - break; - default: - type = "Unknown"; - } - - plist_for_each_entry(req, &c->list, node) { - char *state = "Default"; - - if ((req->node).prio != c->default_value) { - active_reqs++; - state = "Active"; - } - tot_reqs++; - seq_printf(s, "%d: %d: %s\n", tot_reqs, - (req->node).prio, state); - } - - seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", - type, pm_qos_get_value(c), active_reqs, tot_reqs); - -out: - spin_unlock_irqrestore(&pm_qos_lock, flags); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(pm_qos_debug); - /** - * pm_qos_update_target - manages the constraints list and calls the notifiers - * if needed - * @c: constraints data struct - * @node: request to add to the list, to update or to remove - * @action: action to take on the constraints list - * @value: value of the request to add or update + * pm_qos_update_target - Update a list of PM QoS constraint requests. + * @c: List of PM QoS requests. + * @node: Target list entry. + * @action: Action to carry out (add, update or remove). + * @value: New request value for the target list entry. * - * This function returns 1 if the aggregated constraint value has changed, 0 - * otherwise. + * Update the given list of PM QoS constraint requests, @c, by carrying an + * @action involving the @node list entry and @value on it. + * + * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node + * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store + * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove + * @node from the list, ignore @value). + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { - unsigned long flags; int prev_value, curr_value, new_value; - int ret; + unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; @@ -231,9 +115,8 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, break; case PM_QOS_UPDATE_REQ: /* - * to change the list, we atomically remove, reinit - * with new value and add, then see if the extremal - * changed + * To change the list, atomically remove, reinit with new value + * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); /* fall through */ @@ -252,16 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); - if (prev_value != curr_value) { - ret = 1; - if (c->notifiers) - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - } else { - ret = 0; - } - return ret; + + if (prev_value == curr_value) + return 0; + + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, curr_value, NULL); + + return 1; } /** @@ -283,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, /** * pm_qos_update_flags - Update a set of PM QoS flags. - * @pqf: Set of flags to update. + * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * - * Update the given set of PM QoS flags and call notifiers if the aggregate - * value has changed. Returns 1 if the aggregate constraint value has changed, - * 0 otherwise. + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, @@ -326,288 +205,180 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); - return prev_value != curr_value; -} -/** - * pm_qos_request - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value. - */ -int pm_qos_request(int pm_qos_class) -{ - return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); -} -EXPORT_SYMBOL_GPL(pm_qos_request); - -int pm_qos_request_active(struct pm_qos_request *req) -{ - return req->pm_qos_class != 0; + return prev_value != curr_value; } -EXPORT_SYMBOL_GPL(pm_qos_request_active); -static void __pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) -{ - trace_pm_qos_update_request(req->pm_qos_class, new_value); +#ifdef CONFIG_CPU_IDLE +/* Definitions related to the CPU latency QoS. */ - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); -} +static struct pm_qos_constraints cpu_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), + .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; /** - * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout - * @work: work struct for the delayed work (timeout) - * - * This cancels the timeout request by falling back to the default at timeout. + * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ -static void pm_qos_work_fn(struct work_struct *work) +s32 cpu_latency_qos_limit(void) { - struct pm_qos_request *req = container_of(to_delayed_work(work), - struct pm_qos_request, - work); - - __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); + return pm_qos_read_value(&cpu_latency_constraints); } /** - * pm_qos_add_request - inserts new qos request into the list - * @req: pointer to a preallocated handle - * @pm_qos_class: identifies which list of qos request to use - * @value: defines the qos request + * cpu_latency_qos_request_active - Check the given PM QoS request. + * @req: PM QoS request to check. * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request - * handle. Caller needs to save this handle for later use in updates and - * removal. + * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' + * otherwise. */ - -void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) +bool cpu_latency_qos_request_active(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ - return; + return req->qos == &cpu_latency_constraints; +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); - if (pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); - return; - } - req->pm_qos_class = pm_qos_class; - INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); - trace_pm_qos_add_request(pm_qos_class, value); - pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); +static void cpu_latency_qos_apply(struct pm_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret = pm_qos_update_target(req->qos, &req->node, action, value); + if (ret > 0) + wake_up_all_idle_cpus(); } -EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_request - modifies an existing qos request - * @req : handle to list element holding a pm_qos request to use - * @value: defines the qos request + * cpu_latency_qos_add_request - Add new CPU latency QoS request. + * @req: Pointer to a preallocated handle. + * @value: Requested constraint value. * - * Updates an existing qos request for the pm_qos_class of parameters along - * with updating the target pm_qos_class value. + * Use @value to initialize the request handle pointed to by @req, insert it as + * a new entry to the CPU latency QoS list and recompute the effective QoS + * constraint for that list. * - * Attempts are made to make this code callable on hot code paths. + * Callers need to save the handle for later use in updates and removal of the + * QoS request represented by it. */ -void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) +void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + if (cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } - cancel_delayed_work_sync(&req->work); - __pm_qos_update_request(req, new_value); + trace_pm_qos_add_request(value); + + req->qos = &cpu_latency_constraints; + cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } -EXPORT_SYMBOL_GPL(pm_qos_update_request); +EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** - * pm_qos_update_request_timeout - modifies an existing qos request temporarily. - * @req : handle to list element holding a pm_qos request to use - * @new_value: defines the temporal qos request - * @timeout_us: the effective duration of this qos request in usecs. + * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. + * @req : QoS request to update. + * @new_value: New requested constraint value. * - * After timeout_us, this qos request is cancelled automatically. + * Use @new_value to update the QoS request represented by @req in the CPU + * latency QoS list along with updating the effective constraint value for that + * list. */ -void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, - unsigned long timeout_us) +void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req) return; - if (WARN(!pm_qos_request_active(req), - "%s called for unknown object.", __func__)) + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; + } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request(new_value); - trace_pm_qos_update_request_timeout(req->pm_qos_class, - new_value, timeout_us); - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + if (new_value == req->node.prio) + return; - schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); + cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } +EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** - * pm_qos_remove_request - modifies an existing qos request - * @req: handle to request list element + * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. + * @req: QoS request to remove. * - * Will remove pm qos request from the list of constraints and - * recompute the current target value for the pm_qos_class. Call this - * on slow code paths. + * Remove the CPU latency QoS request represented by @req from the CPU latency + * QoS list along with updating the effective constraint value for that list. */ -void pm_qos_remove_request(struct pm_qos_request *req) +void cpu_latency_qos_remove_request(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); - trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); - pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, - PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_request); - -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); - -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; +EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); +/* User space interface to the CPU latency QoS via misc device. */ - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); - -/* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) +static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, - &pm_qos_debug_fops); + struct pm_qos_request *req; - return misc_register(&qos->pm_qos_power_miscdev); -} + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; + cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; - for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; - } - return -1; + return 0; } -static int pm_qos_power_open(struct inode *inode, struct file *filp) +static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { - long pm_qos_class; - - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { - struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return -ENOMEM; - - pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); - filp->private_data = req; - - return 0; - } - return -EPERM; -} + struct pm_qos_request *req = filp->private_data; -static int pm_qos_power_release(struct inode *inode, struct file *filp) -{ - struct pm_qos_request *req; + filp->private_data = NULL; - req = filp->private_data; - pm_qos_remove_request(req); + cpu_latency_qos_remove_request(req); kfree(req); return 0; } - -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) { - s32 value; - unsigned long flags; struct pm_qos_request *req = filp->private_data; + unsigned long flags; + s32 value; - if (!req) - return -EINVAL; - if (!pm_qos_request_active(req)) + if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); + value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -620,36 +391,38 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return ret; } - req = filp->private_data; - pm_qos_update_request(req, value); + cpu_latency_qos_update_request(filp->private_data, value); return count; } +static const struct file_operations cpu_latency_qos_fops = { + .write = cpu_latency_qos_write, + .read = cpu_latency_qos_read, + .open = cpu_latency_qos_open, + .release = cpu_latency_qos_release, + .llseek = noop_llseek, +}; -static int __init pm_qos_power_init(void) -{ - int ret = 0; - int i; - struct dentry *d; - - BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); +static struct miscdevice cpu_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_dma_latency", + .fops = &cpu_latency_qos_fops, +}; - d = debugfs_create_dir("pm_qos", NULL); +static int __init cpu_latency_qos_init(void) +{ + int ret; - for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i], d); - if (ret < 0) { - pr_err("%s: %s setup failed\n", - __func__, pm_qos_array[i]->name); - return ret; - } - } + ret = misc_register(&cpu_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_latency_qos_miscdev.name); return ret; } - -late_initcall(pm_qos_power_init); +late_initcall(cpu_latency_qos_init); +#endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d82b7b88d616..659800157b17 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1744,9 +1744,6 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; - /* Add number of pages required for page keys (s390 only). */ - size += page_key_additional_pages(saveable); - /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -2075,8 +2072,6 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Save page key for data page (s390 only). */ - page_key_read(buf + j); } } @@ -2226,9 +2221,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Extract and buffer page key for data page (s390 only). */ - page_key_memorize(buf + j); - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else @@ -2623,11 +2615,6 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; - /* Allocate buffer for page keys. */ - error = page_key_alloc(nr_copy_pages); - if (error) - return error; - hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); @@ -2649,8 +2636,6 @@ int snapshot_write_next(struct snapshot_handle *handle) } } else { copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); hibernate_restore_protect_page(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) @@ -2673,9 +2658,6 @@ int snapshot_write_next(struct snapshot_handle *handle) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); - page_key_free(); hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { diff --git a/kernel/power/user.c b/kernel/power/user.c index 77438954cc2b..7959449765d9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -27,8 +27,6 @@ #include "power.h" -#define SNAPSHOT_MINOR 231 - static struct snapshot_data { struct snapshot_handle handle; int swap; @@ -198,6 +196,50 @@ unlock: return res; } +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static int snapshot_set_swap_area(struct snapshot_data *data, + void __user *argp) +{ + sector_t offset; + dev_t swdev; + + if (swsusp_swap_in_use()) + return -EPERM; + + if (in_compat_syscall()) { + struct compat_resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } else { + struct resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (!swdev) { + data->swap = -1; + return -EINVAL; + } + data->swap = swap_type_of(swdev, offset, NULL); + if (data->swap < 0) + return -ENODEV; + return 0; +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -353,34 +395,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_AREA: - if (swsusp_swap_in_use()) { - error = -EPERM; - } else { - struct resume_swap_area swap_area; - dev_t swdev; - - error = copy_from_user(&swap_area, (void __user *)arg, - sizeof(struct resume_swap_area)); - if (error) { - error = -EFAULT; - break; - } - - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - swdev = new_decode_dev(swap_area.dev); - if (swdev) { - offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } + error = snapshot_set_swap_area(data, (void __user *)arg); break; default: @@ -395,12 +410,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } #ifdef CONFIG_COMPAT - -struct compat_resume_swap_area { - compat_loff_t offset; - u32 dev; -} __packed; - static long snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -409,49 +418,15 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_ALLOC_SWAP_PAGE: { - compat_loff_t __user *uoffset = compat_ptr(arg); - loff_t offset; - mm_segment_t old_fs; - int err; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, cmd, (unsigned long) &offset); - set_fs(old_fs); - if (!err && put_user(offset, uoffset)) - err = -EFAULT; - return err; - } - + case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: + case SNAPSHOT_SET_SWAP_AREA: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); - - case SNAPSHOT_SET_SWAP_AREA: { - struct compat_resume_swap_area __user *u_swap_area = - compat_ptr(arg); - struct resume_swap_area swap_area; - mm_segment_t old_fs; - int err; - - err = get_user(swap_area.offset, &u_swap_area->offset); - err |= get_user(swap_area.dev, &u_swap_area->dev); - if (err) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, - (unsigned long) &swap_area); - set_fs(old_fs); - return err; - } - default: return snapshot_ioctl(file, cmd, arg); } } - #endif /* CONFIG_COMPAT */ static const struct file_operations snapshot_fops = { diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c8e6ab689d42..b2b0f526f249 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); +void printk_safe_init(void); +bool printk_percpu_data_ready(void); + #define printk_safe_enter_irqsave(flags) \ do { \ local_irq_save(flags); \ @@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } #define printk_safe_enter_irq() local_irq_disable() #define printk_safe_exit_irq() local_irq_enable() +static inline void printk_safe_init(void) { } +static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fada22dc4ab6..9a9b6156270b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -460,6 +460,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before + * per_cpu_areas are initialised. This variable is set to true when + * it's safe to access per-CPU data. + */ +static bool __printk_percpu_data_ready __read_mostly; + +bool printk_percpu_data_ready(void) +{ + return __printk_percpu_data_ready; +} + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1146,12 +1158,28 @@ static void __init log_buf_add_cpu(void) static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ +static void __init set_percpu_data_ready(void) +{ + printk_safe_init(); + /* Make sure we set this flag only after printk_safe() init is done */ + barrier(); + __printk_percpu_data_ready = true; +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; unsigned int free; + /* + * Some archs call setup_log_buf() multiple times - first is very + * early, e.g. from setup_arch(), and second - when percpu_areas + * are initialised. + */ + if (!early) + set_percpu_data_ready(); + if (log_buf != __log_buf) return; @@ -1772,9 +1800,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, trace_console_rcuidle(text, len); - if (!console_drivers) - return; - for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -2653,19 +2678,17 @@ void register_console(struct console *newcon) struct console_cmdline *c; static bool has_preferred; - if (console_drivers) - for_each_console(bcon) - if (WARN(bcon == newcon, - "console '%s%d' already registered\n", - bcon->name, bcon->index)) - return; + for_each_console(bcon) { + if (WARN(bcon == newcon, "console '%s%d' already registered\n", + bcon->name, bcon->index)) + return; + } /* * before we register a new CON_BOOT console, make sure we don't * already have a valid console */ - if (console_drivers && newcon->flags & CON_BOOT) { - /* find the last or real console */ + if (newcon->flags & CON_BOOT) { for_each_console(bcon) { if (!(bcon->flags & CON_BOOT)) { pr_info("Too late to register bootconsole %s%d\n", @@ -2813,7 +2836,7 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { - struct console *a, *b; + struct console *con; int res; pr_info("%sconsole [%s%d] disabled\n", @@ -2821,26 +2844,30 @@ int unregister_console(struct console *console) console->name, console->index); res = _braille_unregister_console(console); - if (res) + if (res < 0) return res; + if (res > 0) + return 0; - res = 1; + res = -ENODEV; console_lock(); if (console_drivers == console) { console_drivers=console->next; res = 0; - } else if (console_drivers) { - for (a=console_drivers->next, b=console_drivers ; - a; b=a, a=b->next) { - if (a == console) { - b->next = a->next; + } else { + for_each_console(con) { + if (con->next == console) { + con->next = console->next; res = 0; break; } } } - if (!res && (console->flags & CON_EXTENDED)) + if (res) + goto out_disable_unlock; + + if (console->flags & CON_EXTENDED) nr_ext_console_drivers--; /* @@ -2853,6 +2880,16 @@ int unregister_console(struct console *console) console->flags &= ~CON_ENABLED; console_unlock(); console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + + return res; + +out_disable_unlock: + console->flags &= ~CON_ENABLED; + console_unlock(); + return res; } EXPORT_SYMBOL(unregister_console); @@ -2966,6 +3003,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { void wake_up_klogd(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); @@ -2976,6 +3016,9 @@ void wake_up_klogd(void) void defer_console_output(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index b4045e782743..d9a659a686f3 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -27,7 +27,6 @@ * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -static int printk_safe_irq_ready __read_mostly; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ sizeof(atomic_t) - \ @@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); /* Get flushed in a more safe context. */ static void queue_flush_work(struct printk_safe_seq_buf *s) { - if (printk_safe_irq_ready) + if (printk_percpu_data_ready()) irq_work_queue(&s->work); } @@ -402,14 +401,6 @@ void __init printk_safe_init(void) #endif } - /* - * In the highly unlikely event that a NMI were to trigger at - * this moment. Make sure IRQ work is set up before this - * variable is set. - */ - barrier(); - printk_safe_irq_ready = 1; - /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,37 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. + +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -210,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..452feae8de20 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,8 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +48,8 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cf66a3ccd757 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ @@ -441,6 +442,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR @@ -454,6 +457,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +475,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) @@ -498,6 +503,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -507,6 +513,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..efb792e13fca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -20,7 +20,7 @@ #include <linux/err.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -45,12 +45,25 @@ #include <linux/sched/sysctl.h> #include <linux/oom.h> #include <linux/tick.h> +#include <linux/rcupdate_trace.h> #include "rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ @@ -102,6 +115,9 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -665,6 +681,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +695,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, @@ -725,6 +746,72 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. */ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -734,7 +821,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -833,7 +920,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -843,7 +930,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -914,7 +1001,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } @@ -1147,6 +1234,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1269,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -1283,6 +1378,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -1444,9 +1540,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1536,6 +1632,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1544,6 +1641,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1599,6 +1697,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1607,26 +1706,37 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1636,7 +1746,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1692,8 +1802,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { @@ -2400,7 +2510,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; @@ -1268,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1277,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..ce23f6cc5043 --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,1193 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU_GENERIC + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(struct list_head *hop); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); + +/** + * Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdout_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). + * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; + unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; + unsigned long n_ipis_fails; + struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; + call_rcu_func_t call_func; + char *name; + char *kname; +}; + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ +} + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; + +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); +} + +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_tasks *rtp = arg; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_spinlock(); // Order updates vs. GP. + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rtp->gp_func(rtp); + rtp->n_gps++; + + /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_uninterruptible(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + } +} + +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) + return; + smp_mb(); /* Ensure others see full kthread. */ +} + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + rtp->kname, + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} + +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); + rtp->postscan_func(&holdouts); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); +} + +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning taskslist and draining the holdout list. */ +void rcu_tasks_postscan(struct list_head *hop) +{ + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(struct rcu_tasks *rtp) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + rtp->n_ipis += cpumask_weight(cpu_online_mask); + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) +{ + int nq = t->trc_reader_special.b.need_qs; + + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + irq_work_queue(&rcu_tasks_trace_iw); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + int cpu = task_cpu(t); + bool in_qs = false; + bool ofl = cpu_is_offline(cpu); + + if (task_curr(t)) { + WARN_ON_ONCE(ofl & !is_idle_task(t)); + + // If no chance of heavyweight readers, do it the hard way. + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state despite its currently running. + // However, we cannot safely change its state. + n_heavy_reader_attempts++; + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_checked, false); + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) +{ + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_special.b.need_qs], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *g, *t; + + // Disable CPU hotplug across the holdout list scan. + cpus_read_lock(); + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) +{ + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + // Stall warning time, so make a list of the offenders. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } + smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. +} + +/* Report any needed quiescent state for this exiting task. */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); +} + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[64]; + + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 550193a9ce76..f288477ee1c2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -67,6 +67,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* @@ -100,7 +113,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; @@ -225,7 +238,9 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ static void rcu_dynticks_eqs_enter(void) { @@ -237,8 +252,9 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -248,7 +264,8 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ static void rcu_dynticks_eqs_exit(void) { @@ -261,6 +278,8 @@ static void rcu_dynticks_eqs_exit(void) * critical section. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { @@ -333,6 +352,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) } /* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! + return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + +/* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the * next exit from an extended quiescent state. Returns true if @@ -571,6 +612,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; return; } @@ -583,7 +625,9 @@ static void rcu_eqs_enter(bool user) rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -663,7 +707,9 @@ static __always_inline void rcu_nmi_exit_common(bool irq) if (irq) rcu_prepare_for_idle(); + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. if (irq) rcu_dynticks_task_enter(); @@ -738,11 +784,14 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -819,19 +868,28 @@ static __always_inline void rcu_nmi_enter_common(bool irq) if (irq) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. if (irq) rcu_cleanup_after_idle(); incby = 1; - } else if (tick_nohz_full_cpu(rdp->cpu) && + } else if (irq && tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !READ_ONCE(rdp->rcu_forced_tick)) { + // We get here only if we had already exited the extended + // quiescent state and this was an interrupt (not an NMI). + // Therefore, (1) RCU is already watching and (2) The fact + // that we are in an interrupt handler and that the rcu_node + // lock is an irq-disabled lock prevents self-deadlock. + // So we can safely recheck under the lock. raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU + // needs a quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } @@ -1124,6 +1182,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -1216,7 +1275,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1472,6 +1531,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1505,6 +1589,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1610,12 +1695,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1653,13 +1742,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1672,6 +1763,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1679,12 +1771,16 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1704,6 +1800,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1780,6 +1877,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -1820,6 +1918,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) @@ -2810,6 +2909,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2923,6 +3024,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + WRITE_ONCE(krcp->count, 0); + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3059,6 +3162,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + WRITE_ONCE(krcp->count, krcp->count + 1); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3073,6 +3178,56 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += READ_ONCE(krcp->count); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -3598,6 +3753,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -3993,6 +4149,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..43991a40b084 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ @@ -454,6 +455,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..72952edad1e4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ @@ -721,17 +700,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..50caa3fcbad2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -331,6 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -345,9 +346,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +354,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +389,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -500,12 +493,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -556,7 +549,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -569,16 +562,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -615,19 +603,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && @@ -640,7 +627,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -699,7 +685,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } @@ -760,8 +746,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -854,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } @@ -1036,7 +1021,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); @@ -1079,9 +1065,9 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); @@ -2536,7 +2522,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; @@ -2567,3 +2553,21 @@ static void rcu_dynticks_task_exit(void) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..ae76bd329582 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } @@ -192,14 +224,40 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_node *rnp; + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rnp = t->rcu_blocked_node; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +266,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); @@ -299,6 +365,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -313,6 +389,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +410,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +424,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,15 +435,15 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); @@ -371,7 +451,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -408,7 +488,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -421,13 +501,11 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ @@ -442,7 +520,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -467,7 +545,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +624,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +633,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } @@ -581,23 +659,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,7 +685,7 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -615,7 +693,7 @@ void show_rcu_gp_kthreads(void) if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* sched_show_task(rcu_state.gp_kthread); */ + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a4ad8e0406c7..3ce63a91d956 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,7 @@ #include <linux/sched/isolation.h> #include <linux/kprobes.h> #include <linux/slab.h> +#include <linux/irq_work.h> #define CREATE_TRACE_POINTS @@ -51,6 +52,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -63,12 +77,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +96,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section @@ -98,15 +112,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; @@ -239,18 +253,30 @@ core_initcall(rcu_set_runtime_mode); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +struct lockdep_map rcu_lock_map = { + .name = "rcu_read_lock", + .key = &rcu_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */ +}; EXPORT_SYMBOL_GPL(rcu_lock_map); static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +struct lockdep_map rcu_bh_lock_map = { + .name = "rcu_read_lock_bh", + .key = &rcu_bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */ +}; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +struct lockdep_map rcu_sched_lock_map = { + .name = "rcu_read_lock_sched", + .key = &rcu_sched_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_SPIN, +}; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); static struct lock_class_key rcu_callback_key; @@ -489,370 +515,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. - */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -923,6 +585,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..a778554f9dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -29,12 +29,12 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (x->done != UINT_MAX) x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + lockdep_assert_RT_in_threaded_ctx(); + + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done = UINT_MAX; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_all_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_SWAITQUEUE(wait); - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } + __prepare_to_swait(&x->wait, &wait); __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x, complete_acquire(x); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); complete_release(x); @@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x) if (!READ_ONCE(x->done)) return false; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = false; else if (x->done != UINT_MAX) x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +324,8 @@ bool completion_done(struct completion *x) * otherwise we can end up freeing the completion before complete() * is done referencing it. */ - spin_lock_irqsave(&x->wait.lock, flags); - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a9983da4408..5ca567adfcb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -269,7 +269,6 @@ static void __hrtick_start(void *arg) rq_lock(rq, &rf); __hrtick_restart(rq); - rq->hrtick_csd_pending = 0; rq_unlock(rq, &rf); } @@ -293,12 +292,10 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); - if (rq == this_rq()) { + if (rq == this_rq()) __hrtick_restart(rq); - } else if (!rq->hrtick_csd_pending) { + else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } } #else @@ -322,8 +319,6 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - rq->hrtick_csd.flags = 0; rq->hrtick_csd.func = __hrtick_start; rq->hrtick_csd.info = rq; @@ -761,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; - p->se.runnable_weight = load->weight; return; } @@ -774,7 +768,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; - p->se.runnable_weight = load->weight; } } @@ -1652,7 +1645,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + /* + * Picking a ~random cpu helps in cases where we are changing affinity + * for groups of tasks (ie. cpuset), so that load balancing is not + * immediately required to distribute the tasks within their new mask. + */ + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; @@ -2121,12 +2119,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) return cpu; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2574,6 +2566,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2648,6 +2642,52 @@ out: } /** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3578,6 +3618,17 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +DEFINE_PER_CPU(unsigned long, thermal_pressure); + +void arch_set_thermal_pressure(struct cpumask *cpus, + unsigned long th_pressure) +{ + int cpu; + + for_each_cpu(cpu, cpus) + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3588,12 +3639,16 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + unsigned long thermal_pressure; + arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); update_rq_clock(rq); + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); psi_task_tick(rq); @@ -3671,7 +3726,6 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; - curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -4074,6 +4128,8 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + trace_sched_switch(preempt, prev, next); /* Also unlocks the rq: */ @@ -4112,7 +4168,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * it wants to wake up a task to maintain concurrency. * As this function is called inside the schedule() context, * we disable preemption to avoid it calling schedule() again - * in the possible wakeup of a kworker. + * in the possible wakeup of a kworker and because wq_worker_sleeping() + * requires it. */ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); @@ -6685,7 +6742,6 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON - rq->last_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); #endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1a2719e1350a..0033731a0797 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -41,8 +41,67 @@ static int convert_prio(int prio) return cpupri; } +static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, int idx) +{ + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + return 0; + + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + return 0; + + if (lowest_mask) { + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_empty(lowest_mask)) + return 0; + } + + return 1; +} + +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask) +{ + return cpupri_find_fitness(cp, p, lowest_mask, NULL); +} + /** - * cpupri_find - find the best (lowest-pri) CPU in the system + * cpupri_find_fitness - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) @@ -58,84 +117,59 @@ static int convert_prio(int prio) * * Return: (int)bool - CPUs were found */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask, bool (*fitness_fn)(struct task_struct *p, int cpu)) { - int idx = 0; int task_pri = convert_prio(p->prio); + int idx, cpu; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - int skip = 0; - - if (!atomic_read(&(vec)->count)) - skip = 1; - /* - * When looking at the vector, we need to read the counter, - * do a memory barrier, then read the mask. - * - * Note: This is still all racey, but we can deal with it. - * Ideally, we only want to look at masks that are set. - * - * If a mask is not set, then the only thing wrong is that we - * did a little more work than necessary. - * - * If we read a zero count but the mask is set, because of the - * memory barriers, that can only happen when the highest prio - * task for a run queue has left the run queue, in which case, - * it will be followed by a pull. If the task we are processing - * fails to find a proper place to go, that pull request will - * pull this task if the run queue is running at a lower - * priority. - */ - smp_rmb(); - /* Need to do the rmb for every iteration */ - if (skip) + if (!__cpupri_find(cp, p, lowest_mask, idx)) continue; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) - continue; + if (!lowest_mask || !fitness_fn) + return 1; - if (lowest_mask) { - int cpu; - - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); - - /* - * We have to ensure that we have at least one bit - * still set in the array, since the map could have - * been concurrently emptied between the first and - * second reads of vec->mask. If we hit this - * condition, simply act as though we never hit this - * priority level and continue on. - */ - if (cpumask_empty(lowest_mask)) - continue; - - if (!fitness_fn) - return 1; - - /* Ensure the capacity of the CPUs fit the task */ - for_each_cpu(cpu, lowest_mask) { - if (!fitness_fn(p, cpu)) - cpumask_clear_cpu(cpu, lowest_mask); - } - - /* - * If no CPU at the current priority can fit the task - * continue looking - */ - if (cpumask_empty(lowest_mask)) - continue; + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); } + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) + continue; + return 1; } + /* + * If we failed to find a fitting lowest_mask, kick off a new search + * but without taking into account any fitness criteria this time. + * + * This rule favours honouring priority over fitting the task in the + * correct CPU (Capacity Awareness being the only user now). + * The idea is that if a higher priority task can run, then it should + * run even if this ends up being on unfitting CPU. + * + * The cost of this trade-off is not entirely clear and will probably + * be good for some workloads and bad for others. + * + * The main idea here is that if some CPUs were overcommitted, we try + * to spread which is what the scheduler traditionally did. Sys admins + * must do proper RT planning to avoid overloading the system if they + * really care. + */ + if (fitness_fn) + return cpupri_find(cp, p, lowest_mask); + return 0; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 32dd520db11f..efbb492bb94c 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -19,8 +19,10 @@ struct cpupri { #ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask, - bool (*fitness_fn)(struct task_struct *p, int cpu)); + struct cpumask *lowest_mask); +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cff3e656566d..ff9435dee1df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -909,8 +909,10 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } while (read_seqcount_retry(&vtime->seqcount, seq)); } -static int vtime_state_check(struct vtime *vtime, int cpu) +static int vtime_state_fetch(struct vtime *vtime, int cpu) { + int state = READ_ONCE(vtime->state); + /* * We raced against a context switch, fetch the * kcpustat task again. @@ -927,10 +929,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu) * * Case 1) is ok but 2) is not. So wait for a safe VTIME state. */ - if (vtime->state == VTIME_INACTIVE) + if (state == VTIME_INACTIVE) return -EAGAIN; - return 0; + return state; } static u64 kcpustat_user_vtime(struct vtime *vtime) @@ -949,14 +951,15 @@ static int kcpustat_field_vtime(u64 *cpustat, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { + int state; + seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *val = cpustat[usage]; @@ -969,7 +972,7 @@ static int kcpustat_field_vtime(u64 *cpustat, */ switch (usage) { case CPUTIME_SYSTEM: - if (vtime->state == VTIME_SYS) + if (state == VTIME_SYS) *val += vtime->stime + vtime_delta(vtime); break; case CPUTIME_USER: @@ -981,11 +984,11 @@ static int kcpustat_field_vtime(u64 *cpustat, *val += kcpustat_user_vtime(vtime); break; case CPUTIME_GUEST: - if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + if (state == VTIME_GUEST && task_nice(tsk) <= 0) *val += vtime->gtime + vtime_delta(vtime); break; case CPUTIME_GUEST_NICE: - if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + if (state == VTIME_GUEST && task_nice(tsk) > 0) *val += vtime->gtime + vtime_delta(vtime); break; default: @@ -1000,12 +1003,12 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu) { u64 *cpustat = kcpustat->cpustat; + u64 val = cpustat[usage]; struct rq *rq; - u64 val; int err; if (!vtime_accounting_enabled_cpu(cpu)) - return cpustat[usage]; + return val; rq = cpu_rq(cpu); @@ -1036,23 +1039,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { u64 *cpustat; u64 delta; + int state; seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *dst = *src; cpustat = dst->cpustat; /* Task is sleeping, dead or idle, nothing to add */ - if (vtime->state < VTIME_SYS) + if (state < VTIME_SYS) continue; delta = vtime_delta(vtime); @@ -1061,15 +1064,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, * Task runs either in user (including guest) or kernel space, * add pending nohz time to the right place. */ - if (vtime->state == VTIME_SYS) { + if (state == VTIME_SYS) { cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; - } else if (vtime->state == VTIME_USER) { + } else if (state == VTIME_USER) { if (task_nice(tsk) > 0) cpustat[CPUTIME_NICE] += vtime->utime + delta; else cpustat[CPUTIME_USER] += vtime->utime + delta; } else { - WARN_ON_ONCE(vtime->state != VTIME_GUEST); + WARN_ON_ONCE(state != VTIME_GUEST); if (task_nice(tsk) > 0) { cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; cpustat[CPUTIME_NICE] += vtime->gtime + delta; @@ -1080,7 +1083,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, } } while (read_seqcount_retry(&vtime->seqcount, seq)); - return err; + return 0; } void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43323f875cb9..504d2f51b0d6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -153,7 +153,7 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; @@ -334,6 +334,8 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) return dl_rq->root.rb_leftmost == &dl_se->rb_node; } +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) { raw_spin_lock_init(&dl_b->dl_runtime_lock); @@ -2496,7 +2498,7 @@ int sched_dl_global_validate(void) return ret; } -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 879d3ccf3806..a562df57a86e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -402,11 +402,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); - P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); - P(se->avg.runnable_load_avg); + P(se->avg.runnable_avg); #endif #undef PN_SCHEDSTAT @@ -524,11 +523,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); - SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->avg.runnable_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", + cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", @@ -537,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", - cfs_rq->removed.runnable_sum); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg", + cfs_rq->removed.runnable_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -818,10 +816,12 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) +#define __P(F) __PS(#F, F) +#define P(F) __PS(#F, p->F) +#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) +#define __PN(F) __PSN(#F, F) +#define PN(F) __PSN(#F, p->F) #ifdef CONFIG_NUMA_BALANCING @@ -870,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, SEQ_printf(m, "---------------------------------------------------------" "----------\n"); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) + +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) PN(se.exec_start); PN(se.vruntime); @@ -941,24 +932,27 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, } __P(nr_switches); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_involuntary_switches", (long long)p->nivcsw); + __PS("nr_voluntary_switches", p->nvcsw); + __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); - P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); - P(se.avg.runnable_load_sum); + P(se.avg.runnable_sum); P(se.avg.util_sum); P(se.avg.load_avg); - P(se.avg.runnable_load_avg); + P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); P(se.avg.util_est.enqueued); #endif +#ifdef CONFIG_UCLAMP_TASK + __PS("uclamp.min", p->uclamp[UCLAMP_MIN].value); + __PS("uclamp.max", p->uclamp[UCLAMP_MAX].value); + __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); + __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); +#endif P(policy); P(prio); if (task_has_dl_policy(p)) { @@ -966,11 +960,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.deadline); } #undef PN_SCHEDSTAT -#undef PN -#undef __PN #undef P_SCHEDSTAT -#undef P -#undef __P { unsigned int this_cpu = raw_smp_processor_id(); @@ -978,8 +968,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-45s:%21Ld\n", - "clock-delta", (long long)(t1-t0)); + __PS("clock-delta", t1-t0); } sched_show_numa(p, m); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1217bfe5e81..02f323b85b6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -86,6 +86,19 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +int sched_thermal_decay_shift; +static int __init setup_sched_thermal_decay_shift(char *str) +{ + int _shift = 0; + + if (kstrtoint(str, 0, &_shift)) + pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); + + sched_thermal_decay_shift = clamp(_shift, 0, 10); + return 1; +} +__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); + #ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. @@ -741,9 +754,7 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - - se->runnable_weight = se->load.weight; + sa->load_avg = scale_load_down(se->load.weight); /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -796,6 +807,8 @@ void post_init_entity_util_avg(struct task_struct *p) } } + sa->runnable_avg = cpu_scale; + if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -1473,36 +1486,51 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); - -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { + /* The node has spare capacity that can be used to run more tasks. */ + node_has_spare = 0, + /* + * The node is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + node_fully_busy, + /* + * The node is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + node_overloaded +}; /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; - + unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; + unsigned int nr_running; + unsigned int weight; + enum numa_type node_type; + int idle_cpu; }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) +static inline bool is_core_idle(int cpu) { - int cpu; +#ifdef CONFIG_SCHED_SMT + int sibling; - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; - ns->load += cpu_runnable_load(rq); - ns->compute_capacity += capacity_of(cpu); + if (!idle_cpu(cpu)) + return false; } +#endif + return true; } struct task_numa_env { @@ -1521,20 +1549,128 @@ struct task_numa_env { int best_cpu; }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_util(int cpu); +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, + struct numa_stats *ns) +{ + if ((ns->nr_running > ns->weight) && + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + return node_overloaded; + + if ((ns->nr_running < ns->weight) || + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + return node_has_spare; + + return node_fully_busy; +} + +#ifdef CONFIG_SCHED_SMT +/* Forward declarations of select_idle_sibling helpers */ +static inline bool test_idle_cores(int cpu, bool def); +static inline int numa_idle_core(int idle_core, int cpu) +{ + if (!static_branch_likely(&sched_smt_present) || + idle_core >= 0 || !test_idle_cores(cpu, false)) + return idle_core; + + /* + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ + if (is_core_idle(cpu)) + idle_core = cpu; + + return idle_core; +} +#else +static inline int numa_idle_core(int idle_core, int cpu) +{ + return idle_core; +} +#endif + +/* + * Gather all necessary information to make NUMA balancing placement + * decisions that are compatible with standard load balancer. This + * borrows code and logic from update_sg_lb_stats but sharing a + * common implementation is impractical. + */ +static void update_numa_stats(struct task_numa_env *env, + struct numa_stats *ns, int nid, + bool find_idle) +{ + int cpu, idle_core = -1; + + memset(ns, 0, sizeof(*ns)); + ns->idle_cpu = -1; + + rcu_read_lock(); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->load += cpu_load(rq); + ns->util += cpu_util(cpu); + ns->nr_running += rq->cfs.h_nr_running; + ns->compute_capacity += capacity_of(cpu); + + if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (READ_ONCE(rq->numa_migrate_on) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) + continue; + + if (ns->idle_cpu == -1) + ns->idle_cpu = cpu; + + idle_core = numa_idle_core(idle_core, cpu); + } + } + rcu_read_unlock(); + + ns->weight = cpumask_weight(cpumask_of_node(nid)); + + ns->node_type = numa_classify(env->imbalance_pct, ns); + + if (idle_core >= 0) + ns->idle_cpu = idle_core; +} + static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { struct rq *rq = cpu_rq(env->dst_cpu); - /* Bail out if run-queue part of active NUMA balance. */ - if (xchg(&rq->numa_migrate_on, 1)) + /* Check if run-queue part of active NUMA balance. */ + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { + int cpu; + int start = env->dst_cpu; + + /* Find alternative idle CPU. */ + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + if (cpu == env->best_cpu || !idle_cpu(cpu) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { + continue; + } + + env->dst_cpu = cpu; + rq = cpu_rq(env->dst_cpu); + if (!xchg(&rq->numa_migrate_on, 1)) + goto assign; + } + + /* Failed to find an alternative idle CPU */ return; + } +assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap. */ - if (env->best_cpu != -1) { + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { rq = cpu_rq(env->best_cpu); WRITE_ONCE(rq->numa_migrate_on, 0); } @@ -1590,7 +1726,7 @@ static bool load_too_imbalanced(long src_load, long dst_load, * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ -static void task_numa_compare(struct task_numa_env *env, +static bool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); @@ -1601,9 +1737,10 @@ static void task_numa_compare(struct task_numa_env *env, int dist = env->dist; long moveimp = imp; long load; + bool stopsearch = false; if (READ_ONCE(dst_rq->numa_migrate_on)) - return; + return false; rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); @@ -1614,8 +1751,10 @@ static void task_numa_compare(struct task_numa_env *env, * Because we have preemption enabled we can get migrated around and * end try selecting ourselves (current == env->p) as a swap candidate. */ - if (cur == env->p) + if (cur == env->p) { + stopsearch = true; goto unlock; + } if (!cur) { if (maymove && moveimp >= env->best_imp) @@ -1624,18 +1763,27 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; } + /* Skip this swap candidate if cannot move to the source cpu. */ + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) + goto unlock; + + /* + * Skip this swap candidate if it is not moving to its preferred + * node and the best task is. + */ + if (env->best_task && + env->best_task->numa_preferred_nid == env->src_nid && + cur->numa_preferred_nid != env->src_nid) { + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. - */ - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) - goto unlock; - - /* + * * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ @@ -1662,6 +1810,19 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } + /* Discourage picking a task already on its preferred node */ + if (cur->numa_preferred_nid == env->dst_nid) + imp -= imp / 16; + + /* + * Encourage picking a task that moves to its preferred node. + * This potentially makes imp larger than it's maximum of + * 1998 (see SMALLIMP and task_weight for why) but in this + * case, it does not matter. + */ + if (cur->numa_preferred_nid == env->src_nid) + imp += imp / 8; + if (maymove && moveimp > imp && moveimp > env->best_imp) { imp = moveimp; cur = NULL; @@ -1669,6 +1830,15 @@ static void task_numa_compare(struct task_numa_env *env, } /* + * Prefer swapping with a task moving to its preferred node over a + * task that is not. + */ + if (env->best_task && cur->numa_preferred_nid == env->src_nid && + env->best_task->numa_preferred_nid != env->src_nid) { + goto assign; + } + + /* * If the NUMA importance is less than SMALLIMP, * task migration might only result in ping pong * of tasks and also hurt performance due to cache @@ -1691,42 +1861,95 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; assign: - /* - * One idle CPU per node is evaluated for a task numa move. - * Call select_idle_sibling to maybe find a better one. - */ + /* Evaluate an idle CPU for a task numa move. */ if (!cur) { + int cpu = env->dst_stats.idle_cpu; + + /* Nothing cached so current CPU went idle since the search. */ + if (cpu < 0) + cpu = env->dst_cpu; + /* - * select_idle_siblings() uses an per-CPU cpumask that - * can be used from IRQ context. + * If the CPU is no longer truly idle and the previous best CPU + * is, keep using it. */ - local_irq_disable(); - env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, - env->dst_cpu); - local_irq_enable(); + if (!idle_cpu(cpu) && env->best_cpu >= 0 && + idle_cpu(env->best_cpu)) { + cpu = env->best_cpu; + } + + env->dst_cpu = cpu; } task_numa_assign(env, cur, imp); + + /* + * If a move to idle is allowed because there is capacity or load + * balance improves then stop the search. While a better swap + * candidate may exist, a search is not free. + */ + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) + stopsearch = true; + + /* + * If a swap candidate must be identified and the current best task + * moves its preferred node then stop the search. + */ + if (!maymove && env->best_task && + env->best_task->numa_preferred_nid == env->src_nid) { + stopsearch = true; + } unlock: rcu_read_unlock(); + + return stopsearch; } static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { - long src_load, dst_load, load; bool maymove = false; int cpu; - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; - /* - * If the improvement from just moving env->p direction is better - * than swapping tasks around, check if a move is possible. + * If dst node has spare capacity, then check if there is an + * imbalance that would be overruled by the load balancer. */ - maymove = !load_too_imbalanced(src_load, dst_load, env); + if (env->dst_stats.node_type == node_has_spare) { + unsigned int imbalance; + int src_running, dst_running; + + /* + * Would movement cause an imbalance? Note that if src has + * more running tasks that the imbalance is ignored as the + * move improves the imbalance from the perspective of the + * CPU load balancer. + * */ + src_running = env->src_stats.nr_running - 1; + dst_running = env->dst_stats.nr_running + 1; + imbalance = max(0, dst_running - src_running); + imbalance = adjust_numa_imbalance(imbalance, src_running); + + /* Use idle CPU if there is no imbalance */ + if (!imbalance) { + maymove = true; + if (env->dst_stats.idle_cpu >= 0) { + env->dst_cpu = env->dst_stats.idle_cpu; + task_numa_assign(env, NULL, 0); + return; + } + } + } else { + long src_load, dst_load, load; + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + maymove = !load_too_imbalanced(src_load, dst_load, env); + } for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ @@ -1734,7 +1957,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp, maymove); + if (task_numa_compare(env, taskimp, groupimp, maymove)) + break; } } @@ -1788,10 +2012,10 @@ static int task_numa_migrate(struct task_struct *p) dist = env.dist = node_distance(env.src_nid, env.dst_nid); taskweight = task_weight(p, env.src_nid, dist); groupweight = group_weight(p, env.src_nid, dist); - update_numa_stats(&env.src_stats, env.src_nid); + update_numa_stats(&env, &env.src_stats, env.src_nid, false); taskimp = task_weight(p, env.dst_nid, dist) - taskweight; groupimp = group_weight(p, env.dst_nid, dist) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); @@ -1824,7 +2048,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1848,15 +2072,17 @@ static int task_numa_migrate(struct task_struct *p) } /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) + if (env.best_cpu == -1) { + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); return -EAGAIN; + } best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); return ret; } @@ -1864,7 +2090,7 @@ static int task_numa_migrate(struct task_struct *p) WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); put_task_struct(env.best_task); return ret; } @@ -2573,7 +2799,7 @@ static void task_numa_work(struct callback_head *work) * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { @@ -2835,25 +3061,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight += se->runnable_weight; - - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight -= se->runnable_weight; - - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); - sub_positive(&cfs_rq->avg.runnable_load_sum, - se_runnable(se) * se->avg.runnable_load_sum); -} - -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->avg.load_avg += se->avg.load_avg; @@ -2868,28 +3075,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) } #else static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight, unsigned long runnable) + unsigned long weight) { if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); } dequeue_load_avg(cfs_rq, se); - se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP @@ -2897,16 +3098,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); - se->avg.runnable_load_avg = - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { + if (se->on_rq) account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } + } void reweight_task(struct task_struct *p, int prio) @@ -2916,7 +3114,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight, weight); + reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -3028,50 +3226,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. - * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - * grq->avg.runnable_load_avg - * ge->runnable_weight = ge->load.weight * -------------------------- (7) - * grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - * ge->runnable_weight = ge->load.weight * - * - * max(grq->avg.runnable_load_avg, grq->runnable_weight) - * ----------------------------------------------------- (8) - * max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ - long runnable, load_avg; - - load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - - runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); - - runnable *= shares; - if (load_avg) - runnable /= load_avg; - - return clamp_t(long, runnable, MIN_SHARES, shares); -} #endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3083,7 +3237,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long shares, runnable; + long shares; if (!gcfs_rq) return; @@ -3092,16 +3246,15 @@ static void update_cfs_group(struct sched_entity *se) return; #ifndef CONFIG_SMP - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else shares = calc_group_shares(gcfs_rq); - runnable = calc_group_runnable(gcfs_rq, shares); #endif - reweight_entity(cfs_rq_of(se), se, shares, runnable); + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3226,11 +3379,11 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned). * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * @@ -3313,9 +3466,35 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + + /* Set new sched_entity's runnable */ + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq runnable */ + add_positive(&cfs_rq->avg.runnable_avg, delta); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; +} + +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; - unsigned long runnable_load_avg, load_avg; - u64 runnable_load_sum, load_sum = 0; + unsigned long load_avg; + u64 load_sum = 0; s64 delta_sum; if (!runnable_sum) @@ -3363,20 +3542,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); - - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - - if (se->on_rq) { - delta_sum = runnable_load_sum - - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); - } - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3405,6 +3570,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + update_tg_cfs_load(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); trace_pelt_se_tp(se); @@ -3474,7 +3640,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3485,7 +3651,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + swap(cfs_rq->removed.runnable_avg, removed_runnable); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3497,7 +3663,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sub_positive(&sa->runnable_sum, r * divider); + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); decayed = 1; } @@ -3542,17 +3717,19 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ se->avg.util_sum = se->avg.util_avg * divider; + se->avg.runnable_sum = se->avg.runnable_avg * divider; + se->avg.load_sum = divider; if (se_weight(se)) { se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); } - se->avg.runnable_load_sum = se->avg.load_sum; - enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); @@ -3574,6 +3751,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3680,13 +3859,13 @@ static void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->avg.runnable_load_avg; + return cfs_rq->avg.runnable_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3957,6 +4136,7 @@ static inline void check_schedstat_required(void) #endif } +static inline bool cfs_bandwidth_used(void); /* * MIGRATION @@ -4021,8 +4201,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); update_cfs_group(se); - enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -4035,10 +4215,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + /* + * When bandwidth control is enabled, cfs might have been removed + * because of a parent been throttled but cfs->nr_running > 1. Try to + * add it unconditionnally. + */ + if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); - } } static void __clear_buddies_last(struct sched_entity *se) @@ -4105,7 +4291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_runnable_load_avg(cfs_rq, se); + se_update_runnable(se); update_stats_dequeue(cfs_rq, se, flags); @@ -4541,8 +4727,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se->on_rq) break; - if (dequeue) + if (dequeue) { dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + } else { + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + } + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4610,8 +4801,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue = 0; cfs_rq = cfs_rq_of(se); - if (enqueue) + if (enqueue) { enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + } else { + update_load_avg(cfs_rq, se, 0); + se_update_runnable(se); + } + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; @@ -4619,21 +4815,31 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } - assert_list_leaf_cfs_rq(rq); - if (!se) add_nr_running(rq, task_delta); + /* + * The cfs_rq_throttled() breaks in the above iteration can result in + * incomplete leaf list maintenance, resulting in triggering the + * assertion below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + list_add_leaf_cfs_rq(cfs_rq); + } + + assert_list_leaf_cfs_rq(rq); + /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4648,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -4666,8 +4875,6 @@ next: break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -4678,7 +4885,6 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4707,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. */ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - lsub_positive(&cfs_b->runtime, runtime); } /* @@ -4858,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5258,32 +5456,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; + flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; - - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); + goto enqueue_throttle; } +enqueue_throttle: if (!se) { add_nr_running(rq, 1); /* @@ -5344,17 +5542,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5372,16 +5566,21 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto dequeue_throttle; - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); } +dequeue_throttle: if (!se) sub_nr_running(rq, 1); @@ -5447,6 +5646,29 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) return load; } +static unsigned long cpu_runnable(struct rq *rq) +{ + return cfs_rq_runnable_avg(&rq->cfs); +} + +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int runnable; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_runnable(rq); + + cfs_rq = &rq->cfs; + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + + /* Discount task's runnable from CPU's runnable */ + lsub_positive(&runnable, p->se.avg.runnable_avg); + + return runnable; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -5786,10 +6008,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - __cpumask_clear_cpu(cpu, cpus); - if (!available_idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) { idle = false; + break; + } } + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; @@ -5847,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; - u64 time, cost; - s64 delta; + u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -5886,14 +6109,46 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } time = cpu_clock(this) - time; - cost = this_sd->avg_scan_cost; - delta = (s64)(time - cost) / 8; - this_sd->avg_scan_cost += delta; + update_avg(&this_sd->avg_scan_cost, time); return cpu; } /* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. + */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ + unsigned long best_cap = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + + sync_entity_load_avg(&p->se); + + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { + unsigned long cpu_cap = capacity_of(cpu); + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; + if (task_fits_capacity(p, cpu_cap)) + return cpu; + + if (cpu_cap > best_cap) { + best_cap = cpu_cap; + best_cpu = cpu; + } + } + + return best_cpu; +} + +/* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) @@ -5901,6 +6156,28 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (!sd) + goto symmetric; + + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + +symmetric: if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; @@ -6101,33 +6378,6 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) } /* - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * - * In that case WAKE_AFFINE doesn't make sense and we'll let - * BALANCE_WAKE sort things out. - */ -static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) -{ - long min_cap, max_cap; - - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return 0; - - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; - - /* Minimum capacity is close to max, no need to abort wake_affine */ - if (max_cap - min_cap < max_cap >> 3) - return 0; - - /* Bring task utilization in sync with prev_cpu */ - sync_entity_load_avg(&p->se); - - return !task_fits_capacity(p, min_cap); -} - -/* * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) * to @dst_cpu. */ @@ -6391,8 +6641,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, p->cpus_ptr); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); @@ -7506,6 +7755,9 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; + if (thermal_load_avg(rq)) + return true; + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; @@ -7531,6 +7783,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; bool decayed; /* @@ -7539,8 +7792,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -7562,7 +7818,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->avg.runnable_load_sum) + if (cfs_rq->avg.runnable_sum) return false; return true; @@ -7700,7 +7956,8 @@ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; @@ -7763,8 +8020,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) if (unlikely(irq >= max)) return 1; + /* + * avg_rt.util_avg and avg_dl.util_avg track binary signals + * (running and not running) with weights 0 and 1024 respectively. + * avg_thermal.load_avg tracks thermal pressure and the weighted + * average uses the actual delta max capacity(load). + */ used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); + used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -7921,6 +8185,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) if (sgs->sum_nr_running < sgs->group_weight) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return false; + if ((sgs->group_capacity * 100) > (sgs->group_util * imbalance_pct)) return true; @@ -7946,6 +8214,10 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) (sgs->group_util * imbalance_pct)) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return true; + return false; } @@ -8040,6 +8312,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); + sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; @@ -8315,6 +8588,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); + sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; @@ -8345,7 +8619,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, * Computing avg_load makes sense only when group is fully busy or * overloaded */ - if (sgs->group_type < group_fully_busy) + if (sgs->group_type == group_fully_busy || + sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity; } @@ -8628,6 +8903,21 @@ next_group: } } +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +{ + unsigned int imbalance_min; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the source domain is almost idle. + */ + imbalance_min = 2; + if (src_nr_running <= imbalance_min) + return 0; + + return imbalance; +} + /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. @@ -8724,24 +9014,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) { - unsigned int imbalance_min; - - /* - * Compute an allowed imbalance based on a simple - * pair of communicating tasks that should remain - * local and ignore them. - * - * NOTE: Generally this would have been based on - * the domain size and this was evaluated. However, - * the benefit is similar across a range of workloads - * and machines but scaling by the domain size adds - * the risk that lower domains have to be rebalanced. - */ - imbalance_min = 2; - if (busiest->sum_nr_running <= imbalance_min) - env->imbalance = 0; - } + if (env->sd->flags & SD_NUMA) + env->imbalance = adjust_numa_imbalance(env->imbalance, + busiest->sum_nr_running); return; } @@ -8761,6 +9036,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) { + env->imbalance = 0; + return; + } } /* @@ -9027,6 +9310,14 @@ static struct rq *find_busiest_queue(struct lb_env *env, case migrate_util: util = cpu_util(cpu_of(rq)); + /* + * Don't try to pull utilization from a CPU with one + * running task. Whatever its utilization, we will fail + * detach the task. + */ + if (nr_running <= 1) + continue; + if (busiest_util < util) { busiest_util = util; busiest = rq; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 008d6ac2342b..808244f3ddd9 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -149,6 +149,9 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { unsigned int flags = 0; + bool illegal = false; + char *par; + int len; while (isalpha(*str)) { if (!strncmp(str, "nohz,", 5)) { @@ -169,8 +172,22 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } - pr_warn("isolcpus: Error, unknown flag\n"); - return 0; + /* + * Skip unknown sub-parameter and validate that it is not + * containing an invalid character. + */ + for (par = str, len = 0; *str && *str != ','; str++, len++) { + if (!isalpha(*str) && *str != '_') + illegal = true; + } + + if (illegal) { + pr_warn("isolcpus: Invalid flag %.*s\n", len, par); + return 0; + } + + pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par); + str++; } /* Default behaviour for isolcpus without flags */ diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index bd006b79b360..b647d04d9c8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -121,8 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); + sa->runnable_sum = + decay_load(sa->runnable_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -149,7 +149,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, if (load) sa->load_sum += load * contrib; if (runnable) - sa->runnable_load_sum += runnable * contrib; + sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -238,7 +238,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +___update_load_avg(struct sched_avg *sa, unsigned long load) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; @@ -246,7 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * Step 2: update *_avg. */ sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->runnable_avg = div_u64(sa->runnable_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } @@ -254,33 +254,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * sched_entity: * * task: - * se_runnable() == se_weight() + * se_weight() = se->load.weight + * se_runnable() = !!on_rq * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * se_runnable() = grq->h_nr_running * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg + * runnable_sum = se_runnable() * runnable = grq->runnable_sum + * runnable_avg = runnable_sum * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum + * load_sum := runnable + * load_avg = se_weight(se) * load_sum * * cfq_rq: * + * runnable_sum = \Sum se->avg.runnable_sum + * runnable_avg = \Sum se->avg.runnable_avg + * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg */ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); return 1; } @@ -290,10 +289,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); return 1; @@ -306,10 +305,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), + cfs_rq->h_nr_running, cfs_rq->curr != NULL)) { - ___update_load_avg(&cfs_rq->avg, 1, 1); + ___update_load_avg(&cfs_rq->avg, 1); trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -322,9 +321,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum * - * load_avg and runnable_load_avg are not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -335,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_rt, 1, 1); + ___update_load_avg(&rq->avg_rt, 1); trace_pelt_rt_tp(rq); return 1; } @@ -348,7 +347,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -359,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_dl, 1, 1); + ___update_load_avg(&rq->avg_dl, 1); trace_pelt_dl_tp(rq); return 1; } @@ -367,13 +368,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + * util_avg and runnable_load_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking that track time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because thermal pressure signal is + * time weighted "delta" capacity unlike util_avg which is binary. + * "delta capacity" = actual capacity - + * capped capacity a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + if (___update_load_sum(now, &rq->avg_thermal, + capacity, + capacity, + capacity)) { + ___update_load_avg(&rq->avg_thermal, 1); + trace_pelt_thermal_tp(rq); + return 1; + } + + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -410,7 +444,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) 1); if (ret) { - ___update_load_avg(&rq->avg_irq, 1, 1); + ___update_load_avg(&rq->avg_irq, 1); trace_pelt_irq_tp(rq); } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..eb034d9f024d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else @@ -159,6 +179,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} + +static inline int update_irq_load_avg(struct rq *rq, u64 running) { return 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 028520702717..8f45cdb6463b 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_FULL: return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static u32 psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set, + bool wake_clock) { struct psi_group_cpu *groupc; + u32 state_mask = 0; unsigned int t, m; enum psi_states s; - u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu, if (!(m & (1 << t))) continue; if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } groupc->tasks[t]--; @@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); - return state_mask; + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) return &psi_system; } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set) { - int cpu = task_cpu(task); - struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; - - if (!task->pid) - return; - if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, + task->pid, task->comm, task_cpu(task), task->psi_flags, clear, set); psi_bug = 1; } task->psi_flags &= ~clear; task->psi_flags |= set; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + bool wake_clock = true; + void *iter = NULL; + + if (!task->pid) + return; + + psi_flags_change(task, clear, set); /* * Periodic aggregation shuts off if there is a period of no @@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set) wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; - while ((group = iterate_groups(task, &iter))) { - u32 state_mask = psi_group_change(group, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set, wake_clock); +} + +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + struct psi_group *group, *common = NULL; + int cpu = task_cpu(prev); + void *iter; + + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * When moving state between tasks, the group that + * contains them both does not change: we can stop + * updating the tree once we reach the first common + * ancestor. Iterate @next's ancestors until we + * encounter @prev's state. + */ + iter = NULL; + while ((group = iterate_groups(next, &iter))) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + common = group; + break; + } + + psi_group_change(group, cpu, 0, TSK_ONCPU, true); + } + } + + /* + * If this is a voluntary sleep, dequeue will have taken care + * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We + * only need to deal with it during preemption. + */ + if (sleep) + return; - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + if (prev->pid) { + psi_flags_change(prev, TSK_ONCPU, 0); - if (wake_clock && !delayed_work_pending(&group->avgs_work)) - schedule_delayed_work(&group->avgs_work, PSI_FREQ); + iter = NULL; + while ((group = iterate_groups(prev, &iter)) && group != common) + psi_group_change(group, cpu, TSK_ONCPU, 0, true); } } @@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) + if (task_on_rq_queued(task)) { task_flags = TSK_RUNNING; - else if (task->in_iowait) + if (task_current(rq, task)) + task_flags |= TSK_ONCPU; + } else if (task->in_iowait) task_flags = TSK_IOWAIT; - if (task->flags & PF_MEMSTALL) + if (task->in_memstall) task_flags |= TSK_MEMSTALL; if (task_flags) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4043abe45459..df11d88c9895 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1475,6 +1475,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_lowest_rq(p); /* + * Bail out if we were forcing a migration to find a better + * fitting CPU but our search failed. + */ + if (!test && target != -1 && !rt_task_fits_capacity(p, target)) + goto out_unlock; + + /* * Don't bother moving it if the destination CPU is * not running a lower priority task. */ @@ -1482,6 +1489,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } + +out_unlock: rcu_read_unlock(); out: @@ -1495,7 +1504,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; /* @@ -1503,7 +1512,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) + cpupri_find(&rq->rd->cpupri, p, NULL)) return; /* @@ -1647,8 +1656,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr) && - rt_task_fits_capacity(p, cpu)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; @@ -1682,6 +1690,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + int ret; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1690,8 +1699,22 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, - rt_task_fits_capacity)) + /* + * If we're on asym system ensure we consider the different capacities + * of the CPUs when searching for the lowest_mask. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + + ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, + task, lowest_mask, + rt_task_fits_capacity); + } else { + + ret = cpupri_find(&task_rq(task)->rd->cpupri, + task, lowest_mask); + } + + if (!ret) return -1; /* No targets found */ /* @@ -2202,7 +2225,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio); - if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) + if (need_to_push) push_rt_tasks(rq); } @@ -2274,10 +2297,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - bool need_to_push = rq->rt.overloaded || - !rt_task_fits_capacity(p, cpu_of(rq)); - - if (p->nr_cpus_allowed > 1 && need_to_push) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) @@ -2449,10 +2469,11 @@ const struct sched_class rt_sched_class = { */ static DEFINE_MUTEX(rt_constraints_mutex); -/* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_struct *g, *p; + struct task_struct *task; + struct css_task_iter it; + int ret = 0; /* * Autogroups do not have RT tasks; see autogroup_create(). @@ -2460,12 +2481,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) if (task_group_is_autogroup(tg)) return 0; - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } + css_task_iter_start(&tg->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret |= rt_task(task); + css_task_iter_end(&it); - return 0; + return ret; } struct rt_schedulable_data { @@ -2496,9 +2517,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) return -EINVAL; /* - * Ensure we don't starve existing RT tasks. + * Ensure we don't starve existing RT tasks if runtime turns zero. */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + if (rt_bandwidth_enabled() && !runtime && + tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); @@ -2564,7 +2586,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg, return -EINVAL; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) goto unlock; @@ -2582,7 +2603,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg, } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return err; @@ -2641,9 +2661,7 @@ static int sched_rt_global_constraints(void) int ret = 0; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9ea647835fd6..db3a57675ccf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -118,7 +118,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) @@ -189,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p) #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +static inline void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff / 8; +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -305,7 +317,6 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -489,7 +500,6 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ @@ -528,7 +538,7 @@ struct cfs_rq { int nr; unsigned long load_avg; unsigned long util_avg; - unsigned long runnable_sum; + unsigned long runnable_avg; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -688,8 +698,30 @@ struct dl_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) + +static inline void se_update_runnable(struct sched_entity *se) +{ + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ + if (entity_is_task(se)) + return !!se->on_rq; + else + return se->runnable_weight; +} + #else #define entity_is_task(se) 1 + +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ + return !!se->on_rq; +} #endif #ifdef CONFIG_SMP @@ -701,10 +733,6 @@ static inline long se_weight(struct sched_entity *se) return scale_load_down(se->load.weight); } -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} static inline bool sched_asym_prefer(int a, int b) { @@ -860,7 +888,6 @@ struct rq { #endif #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; #endif /* CONFIG_SMP */ @@ -944,6 +971,9 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; +#endif u64 idle_stamp; u64 avg_idle; @@ -967,7 +997,6 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; @@ -1107,6 +1136,24 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +/** + * By default the decay is the default pelt decay period. + * The decay shift can change the decay period in + * multiples of 32. + * Decay shift Decay period(ms) + * 0 32 + * 1 64 + * 2 128 + * 3 256 + * 4 512 + */ +extern int sched_thermal_decay_shift; + +static inline u64 rq_clock_thermal(struct rq *rq) +{ + return rq_clock_task(rq) >> sched_thermal_decay_shift; +} + static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); @@ -1337,8 +1384,6 @@ extern void sched_ttwu_pending(void); for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) -#define for_each_lower_domain(sd) for (; sd; sd = sd->child) - /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1869,7 +1914,6 @@ extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) @@ -1968,6 +2012,13 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif + #ifndef arch_scale_freq_capacity static __always_inline unsigned long arch_scale_freq_capacity(int cpu) @@ -2492,3 +2543,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } #endif + +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ba683fe81a6e..33d0daf83842 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) return; if (!wakeup || p->sched_psi_wake_requeue) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) set |= TSK_MEMSTALL; if (p->sched_psi_wake_requeue) p->sched_psi_wake_requeue = 0; @@ -90,9 +90,17 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (!sleep) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; } else { + /* + * When a task sleeps, schedule() dequeues it before + * switching to the next one. Merge the clearing of + * TSK_RUNNING and TSK_ONCPU to save an unnecessary + * psi_task_change() call in psi_sched_switch(). + */ + clear |= TSK_ONCPU; + if (p->in_iowait) set |= TSK_IOWAIT; } @@ -109,14 +117,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + if (unlikely(p->in_iowait || p->in_memstall)) { struct rq_flags rf; struct rq *rq; int clear = 0; if (p->in_iowait) clear |= TSK_IOWAIT; - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); @@ -126,18 +134,31 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) } } +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) +{ + if (static_branch_likely(&psi_disabled)) + return; + + psi_task_switch(prev, next, sleep); +} + static inline void psi_task_tick(struct rq *rq) { if (static_branch_likely(&psi_disabled)) return; - if (unlikely(rq->curr->flags & PF_MEMSTALL)) + if (unlikely(rq->curr->in_memstall)) psi_memstall_tick(rq->curr, cpu_of(rq)); } #else /* CONFIG_PSI */ static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} static inline void psi_dequeue(struct task_struct *p, bool sleep) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) {} static inline void psi_task_tick(struct rq *rq) {} #endif /* CONFIG_PSI */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..e1c655f928c7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ + while (!list_empty(&q->task_list)) + swake_up_locked(q); +} + void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dfb64c08a407..8344757bba6e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -317,8 +317,9 @@ static void sched_energy_set(bool has_eas) * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. - * 3. the EM complexity is low enough to keep scheduling overheads low; - * 4. schedutil is driving the frequency of all CPUs of the rd; + * 3. no SMT is detected. + * 4. the EM complexity is low enough to keep scheduling overheads low; + * 5. schedutil is driving the frequency of all CPUs of the rd; * * The complexity of the Energy Model is defined as: * @@ -360,6 +361,13 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; } + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", + cpumask_pr_args(cpu_map)); + goto free; + } + for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) @@ -1374,18 +1382,9 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour. */ - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - /* - * Don't attempt to spread across CPUs of different capacities. - */ - if (sd->child) - sd->child->flags &= ~SD_PREFER_SIBLING; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } + /* Don't attempt to spread across CPUs of different capacities. */ + if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..55a6184f5990 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - preempt_disable(); for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, sd); + u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } - preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -528,8 +526,12 @@ static long seccomp_attach_filter(unsigned int flags, int ret; ret = seccomp_can_sync_threads(); - if (ret) - return ret; + if (ret) { + if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) + return -ESRCH; + else + return ret; + } } /* Set log flag, if present. */ @@ -1221,6 +1223,7 @@ static const struct file_operations seccomp_notify_ops = { .poll = seccomp_notify_poll, .release = seccomp_notify_release, .unlocked_ioctl = seccomp_notify_ioctl, + .compat_ioctl = seccomp_notify_ioctl, }; static struct file *init_listener(struct seccomp_filter *filter) @@ -1288,10 +1291,12 @@ static long seccomp_set_mode_filter(unsigned int flags, * In the successful case, NEW_LISTENER returns the new listener fd. * But in the failure case, TSYNC returns the thread that died. If you * combine these two flags, there's no way to tell whether something - * succeeded or failed. So, let's disallow this combination. + * succeeded or failed. So, let's disallow this combination if the user + * has not explicitly requested no errors from TSYNC. */ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && - (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && + ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; /* Prepare the new filter before holding any locks. */ diff --git a/kernel/signal.c b/kernel/signal.c index 5b2396350dd1..713104884414 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1510,15 +1510,15 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, unsigned long flags; int ret = -EINVAL; + if (!valid_signal(sig)) + return ret; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; - if (!valid_signal(sig)) - return ret; - rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { @@ -1557,12 +1557,8 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; - if (pid > 0) { - rcu_read_lock(); - ret = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return ret; - } + if (pid > 0) + return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) @@ -1931,7 +1927,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * This is only possible if parent == real_parent. * Check if it has changed security domain. */ - if (tsk->parent_exec_id != tsk->parent->self_exec_id) + if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } diff --git a/kernel/smp.c b/kernel/smp.c index d0ada39eb4d4..786092aabdcd 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -329,6 +329,11 @@ EXPORT_SYMBOL(smp_call_function_single); * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * + * If the function is called with one csd which has not yet been + * processed by previous call to smp_call_function_single_async(), the + * function will return immediately with -EBUSY showing that the csd + * object is still in progress. + * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ @@ -338,14 +343,17 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) preempt_disable(); - /* We could deadlock if we have to wait here with interrupts disabled! */ - if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) - csd_lock_wait(csd); + if (csd->flags & CSD_FLAG_LOCK) { + err = -EBUSY; + goto out; + } csd->flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd, csd->func, csd->info); + +out: preempt_enable(); return err; @@ -589,20 +597,13 @@ void __init setup_nr_cpu_ids(void) void __init smp_init(void) { int num_nodes, num_cpus; - unsigned int cpu; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); - /* FIXME: This should be done in userspace --RR */ - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) - break; - if (!cpu_online(cpu)) - cpu_up(cpu); - } + bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0427a86743a4..a47c6dd57452 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,7 +126,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) * Were softirqs turned off above: */ if (softirq_count() == (cnt & SOFTIRQ_MASK)) - trace_softirqs_off(ip); + lockdep_softirqs_off(ip); raw_local_irq_restore(flags); if (preempt_count() == cnt) { @@ -147,7 +147,7 @@ static void __local_bh_enable(unsigned int cnt) trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); if (softirq_count() == (cnt & SOFTIRQ_MASK)) - trace_softirqs_on(_RET_IP_); + lockdep_softirqs_on(_RET_IP_); __preempt_count_sub(cnt); } @@ -174,7 +174,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) * Are softirqs going to be turned on now: */ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) - trace_softirqs_on(ip); + lockdep_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: @@ -224,9 +224,9 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (trace_hardirq_context(current)) { + if (lockdep_hardirq_context(current)) { in_hardirq = true; - trace_hardirq_exit(); + lockdep_hardirq_exit(); } lockdep_softirq_enter(); @@ -239,7 +239,7 @@ static inline void lockdep_softirq_end(bool in_hardirq) lockdep_softirq_exit(); if (in_hardirq) - trace_hardirq_enter(); + lockdep_hardirq_enter(); } #else static inline bool lockdep_softirq_start(void) { return false; } @@ -414,7 +414,8 @@ void irq_exit(void) tick_irq_exit(); rcu_irq_exit(); - trace_hardirq_exit(); /* must be last! */ + /* must be last! */ + lockdep_hardirq_exit(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad5b88a53c5a..8a176d8727a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -212,6 +212,11 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COMPACTION +static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif #endif #ifdef CONFIG_PRINTK @@ -229,25 +234,8 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses its own private copy */ -static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; - static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int error; - - error = proc_dointvec(table, write, buffer, lenp, ppos); - if (error) - return error; - - if (write) - sysrq_toggle_support(__sysrq_enabled); - - return 0; -} - + void __user *buffer, size_t *lenp, loff_t *ppos); #endif static struct ctl_table kern_table[]; @@ -747,7 +735,7 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_MAGIC_SYSRQ { .procname = "sysrq", - .data = &__sysrq_enabled, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, .proc_handler = sysrq_sysctl_handler, @@ -1484,7 +1472,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compact_unevictable_allowed, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_warn_RT_change, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -2572,6 +2560,28 @@ int proc_dointvec(struct ctl_table *table, int write, return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } +#ifdef CONFIG_COMPACTION +static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret, old; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + old = *(int *)table->data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret) + return ret; + if (old != *(int *)table->data) + pr_warn_once("sysctl attribute %s changed by %s[%d]\n", + table->procname, current->comm, + task_pid_nr(current)); + return ret; +} +#endif + /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table @@ -2835,6 +2845,26 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, } #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int sysrq_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int tmp, ret; + + tmp = sysrq_mask(); + + ret = __do_proc_dointvec(&tmp, table, write, buffer, + lenp, ppos, NULL, NULL); + if (ret || !write) + return ret; + + if (write) + sysrq_toggle_support(tmp); + + return 0; +} +#endif + static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..825f28259a19 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -97,16 +97,26 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ - raw_spin_lock_irq(&task->pi_lock); do { + head = NULL; work = READ_ONCE(task->task_works); - head = !work && (task->flags & PF_EXITING) ? - &work_exited : NULL; + if (!work) { + if (task->flags & PF_EXITING) + head = &work_exited; + else + break; + } } while (cmpxchg(&task->task_works, work, head) != work); - raw_spin_unlock_irq(&task->pi_lock); if (!work) break; + /* + * Synchronize with task_work_cancel(). It can not remove + * the first entry == work, cmpxchg(task_works) must fail. + * But it can remove another entry from the ->next list. + */ + raw_spin_lock_irq(&task->pi_lock); + raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 428beb69426a..7cb09c4cf21c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,6 +928,15 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); +#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE + if (cs->vdso_clock_mode < 0 || + cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { + pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", + cs->name, cs->vdso_clock_mode); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } +#endif + /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..d89da1c7e005 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -311,7 +311,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) div >>= 1; } tmp >>= sft; - do_div(tmp, (unsigned long) div); + do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); @@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; - timer->is_hard = !softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1480,6 +1480,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); + bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); @@ -1514,7 +1515,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + expires_in_hardirq = lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index d23b434c2ca7..eddcf4970444 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = { .max_cycles = 10, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -67,9 +68,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12858507d75a..53bce347cd50 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> +#include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> @@ -172,8 +173,8 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces - * the time namespace handling path. + * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. */ static void timens_setup_vdso_data(struct vdso_data *vdata, struct time_namespace *ns) @@ -183,7 +184,7 @@ static void timens_setup_vdso_data(struct vdso_data *vdata, struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vdata->seq = 1; - vdata->clock_mode = VCLOCK_TIMENS; + vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -337,7 +338,20 @@ static struct user_namespace *timens_owner(struct ns_common *ns) static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) { - seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); + char *clock; + + switch (clockid) { + case CLOCK_BOOTTIME: + clock = "boottime"; + break; + case CLOCK_MONOTONIC: + clock = "monotonic"; + break; + default: + clock = "unknown"; + break; + } + seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) @@ -446,6 +460,7 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", + .real_ns_name = "time", .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..2fd3b3fa68bf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -118,6 +118,16 @@ static inline int validate_clock_permissions(const clockid_t clock) return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; } +static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +{ + return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; +} + +static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) +{ + return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -336,9 +346,7 @@ static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) /* * Sample a process (thread group) clock for the given task clkid. If the * group's cputime accounting is already enabled, read the atomic - * store. Otherwise a full update is required. Task's sighand lock must be - * held to protect the task traversal on a full update. clkid is already - * validated. + * store. Otherwise a full update is required. clkid is already validated. */ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) @@ -393,7 +401,12 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.task = p; + new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); + /* + * get_task_for_clock() took a reference on @p. Drop it as the timer + * holds a reference on the pid of @p. + */ + put_task_struct(p); return 0; } @@ -406,13 +419,15 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) static int posix_cpu_timer_del(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Protect against sighand release/switch in exit/exec and process/ @@ -434,8 +449,10 @@ static int posix_cpu_timer_del(struct k_itimer *timer) unlock_task_sighand(p, &flags); } +out: + rcu_read_unlock(); if (!ret) - put_task_struct(p); + put_pid(ctmr->pid); return ret; } @@ -484,12 +501,11 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the sighand lock held. */ -static void arm_timer(struct k_itimer *timer) +static void arm_timer(struct k_itimer *timer, struct task_struct *p) { int clkidx = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; struct posix_cputimer_base *base; if (CPUCLOCK_PERTHREAD(timer->it_clock)) @@ -564,13 +580,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) { + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + rcu_read_unlock(); + return -ESRCH; + } /* * Use the to_ktime conversion because that clamps the maximum @@ -587,8 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); return -ESRCH; + } /* * Disarm any old timer after extracting its expiry time. @@ -662,7 +688,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ cpu_timer_setexpires(ctmr, new_expires); if (new_expires != 0 && val < new_expires) { - arm_timer(timer); + arm_timer(timer, p); } unlock_task_sighand(p, &flags); @@ -693,6 +719,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: + rcu_read_unlock(); if (old) old->it_interval = ns_to_timespec64(old_incr); @@ -704,10 +731,12 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 now, expires = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; + struct task_struct *p; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Easy part: convert the reload time. @@ -715,36 +744,15 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!expires) - return; + goto out; /* * Sample the clock to take the difference with the expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - } else { - struct sighand_struct *sighand; - unsigned long flags; - - /* - * Protect against sighand release/switch in exit/exec and - * also make timer sampling safe if it ends up calling - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Disarm the timer, nothing else to do. - */ - cpu_timer_setexpires(ctmr, 0); - return; - } else { - now = cpu_clock_sample_group(clkid, p, false); - unlock_task_sighand(p, &flags); - } - } + else + now = cpu_clock_sample_group(clkid, p, false); if (now < expires) { itp->it_value = ns_to_timespec64(expires - now); @@ -756,6 +764,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_value.tv_nsec = 1; itp->it_value.tv_sec = 0; } +out: + rcu_read_unlock(); } #define MAX_COLLECTED 20 @@ -976,56 +986,38 @@ static void check_process_timers(struct task_struct *tsk, static void posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; + struct task_struct *p; struct sighand_struct *sighand; unsigned long flags; u64 now; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Fetch the current sample and update the timer's expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) - return; - - /* Protect timer list r/w in arm_timer() */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - return; - } else { - /* - * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - */ - cpu_timer_setexpires(ctmr, 0); - return; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* If the process is dying, no need to rearm */ - goto unlock; - } + else now = cpu_clock_sample_group(clkid, p, true); - bump_cpu_timer(timer, now); - /* Leave the sighand locked for the call below. */ - } + + bump_cpu_timer(timer, now); + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) + goto out; /* * Now re-arm for the new expiry time. */ - arm_timer(timer); -unlock: + arm_timer(timer, p); unlock_task_sighand(p, &flags); +out: + rcu_read_unlock(); } /** @@ -1126,8 +1118,11 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - if (!lock_task_sighand(tsk, &flags)) + lockdep_posixtimer_enter(); + if (!lock_task_sighand(tsk, &flags)) { + lockdep_posixtimer_exit(); return; + } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1169,6 +1164,7 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ff0eb30de346..07709ac30439 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -121,7 +121,8 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash) { + hlist_for_each_entry_rcu(timer, head, t_hash, + lockdep_is_held(&hash_lock)) { if ((timer->it_signal == sig) && (timer->it_id == id)) return timer; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index e4332e3e2d56..fa3f800d7d76 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -208,7 +208,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (sched_clock_timer.function != NULL) { /* update timeout for clock wrap */ - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, + HRTIMER_MODE_REL_HARD); } r = rate; @@ -254,9 +255,9 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); sched_clock_timer.function = sched_clock_poll; - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } /* @@ -293,7 +294,7 @@ void sched_clock_resume(void) struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); rd->read_sched_clock = cd.actual_read_sched_clock; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..6c9c342dd0e5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -84,13 +84,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a792d21cac64..3e2dc9b8858c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevaluate with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { @@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -240,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, + .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), }; /* @@ -676,10 +682,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..9ebaab13339d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1005,9 +1005,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; - rem *= mult; - do_div(rem, div); + rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } @@ -2397,8 +2396,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 141ab3ab0354..099737f6f10c 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { } extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 568564ae3597..a5221abb4594 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1829,21 +1829,23 @@ static void process_timeout(struct timer_list *t) * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): * - * You can set the task state as follows - + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process())". + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * - * The current task state is guaranteed to be TASK_RUNNING when this + * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule @@ -1851,7 +1853,7 @@ static void process_timeout(struct timer_list *t) * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed + * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9577c89179cd..54ce6eb2ca36 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -71,13 +71,15 @@ void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; + s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); - vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); - vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + vdata[CS_HRES_COARSE].clock_mode = clock_mode; + vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; @@ -103,10 +105,10 @@ void update_vsyscall(struct timekeeper *tk) WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* - * Architectures can opt out of updating the high resolution part - * of the VDSO. + * If the current clocksource is not VDSO capable, then spare the + * update of the high reolution parts. */ - if (__arch_update_vdso_data()) + if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); diff --git a/kernel/torture.c b/kernel/torture.c index 8683375dc0c7..a1a41484ff6d 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -101,7 +101,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); starttime = jiffies; (*n_offl_attempts)++; - ret = cpu_down(cpu); + ret = remove_cpu(cpu); if (ret) { s = ""; if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { @@ -159,7 +159,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, torture_type, cpu); starttime = jiffies; (*n_onl_attempts)++; - ret = cpu_up(cpu); + ret = add_cpu(cpu); if (ret) { s = ""; if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { @@ -209,17 +209,18 @@ torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); - if (!IS_MODULE(CONFIG_TORTURE_TEST)) + if (!IS_MODULE(CONFIG_TORTURE_TEST)) { for_each_possible_cpu(cpu) { if (cpu_online(cpu)) continue; - ret = cpu_up(cpu); + ret = add_cpu(cpu); if (ret && verbose) { pr_alert("%s" TORTURE_FLAG "%s: Initial online %d: errno %d\n", __func__, torture_type, cpu, ret); } } + } if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 402eef84c859..ae69010d521a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -158,6 +158,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..ca1796747a77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) if (in_nmi()) /* not supported yet */ return 1; - preempt_disable(); + cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* @@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) @@ -732,7 +730,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -781,8 +779,8 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -843,6 +841,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } @@ -858,14 +860,12 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: return &bpf_get_stack_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -975,7 +975,7 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1028,6 +1028,45 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ +#ifndef CONFIG_X86 + return -ENOENT; +#else + static const u32 br_entry_size = sizeof(struct perf_branch_entry); + struct perf_branch_stack *br_stack = ctx->data->br_stack; + u32 to_copy; + + if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) + return -EINVAL; + + if (unlikely(!br_stack)) + return -EINVAL; + + if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) + return br_stack->nr * br_entry_size; + + if (!buf || (size % br_entry_size != 0)) + return -EINVAL; + + to_copy = min_t(u32, br_stack->nr * br_entry_size, size); + memcpy(buf, br_stack->entries, to_copy); + + return to_copy; +#endif +} + +static const struct bpf_func_proto bpf_read_branch_records_proto = { + .func = bpf_read_branch_records, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1040,8 +1079,10 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; + case BPF_FUNC_read_branch_records: + return &bpf_read_branch_records_proto; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1104,6 +1145,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { }; extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1168,7 +1210,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1179,6 +1221,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; #endif default: return raw_tp_prog_func_proto(func_id, prog); @@ -1213,6 +1257,13 @@ static bool tracing_prog_is_valid_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } +int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_prog_is_valid_access, @@ -1227,6 +1278,7 @@ const struct bpf_verifier_ops tracing_verifier_ops = { }; const struct bpf_prog_ops tracing_prog_ops = { + .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, @@ -1475,10 +1527,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { + cant_sleep(); rcu_read_lock(); - preempt_disable(); (void) BPF_PROG_RUN(prog, args); - preempt_enable(); rcu_read_unlock(); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fd81c7de77a7..771eace959f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -102,7 +102,7 @@ static bool ftrace_pids_enabled(struct ftrace_ops *ops) tr = ops->private; - return tr->function_pids != NULL; + return tr->function_pids != NULL || tr->function_no_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -139,28 +139,27 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { struct trace_array *tr = op->private; + int pid; - if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid)) - return; + if (tr) { + pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + if (pid == FTRACE_PID_IGNORE) + return; + if (pid != FTRACE_PID_TRACE && + pid != current->pid) + return; + } op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -246,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2922,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -5877,7 +5876,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } @@ -6923,11 +6922,17 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; pid_list = rcu_dereference_sched(tr->function_pids); + no_pid_list = rcu_dereference_sched(tr->function_no_pids); - this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, - trace_ignore_this_task(pid_list, next)); + if (trace_ignore_this_task(pid_list, no_pid_list, next)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + next->pid); } static void @@ -6940,6 +6945,9 @@ ftrace_pid_follow_sched_process_fork(void *data, pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, self, task); } static void @@ -6950,6 +6958,9 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); } void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) @@ -6967,42 +6978,57 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) } } -static void clear_ftrace_pids(struct trace_array *tr) +static void clear_ftrace_pids(struct trace_array *tr, int type) { struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; int cpu; pid_list = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); - if (!pid_list) + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) return; - unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + /* See if the pids still need to be checked after this */ + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE; + } - for_each_possible_cpu(cpu) - per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false; + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->function_pids, NULL); - rcu_assign_pointer(tr->function_pids, NULL); + if (type & TRACE_NO_PIDS) + rcu_assign_pointer(tr->function_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ synchronize_rcu(); - trace_free_pid_list(pid_list); + if ((type & TRACE_PIDS) && pid_list) + trace_free_pid_list(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_free_pid_list(no_pid_list); } void ftrace_clear_pids(struct trace_array *tr) { mutex_lock(&ftrace_lock); - clear_ftrace_pids(tr); + clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); mutex_unlock(&ftrace_lock); } -static void ftrace_pid_reset(struct trace_array *tr) +static void ftrace_pid_reset(struct trace_array *tr, int type) { mutex_lock(&ftrace_lock); - clear_ftrace_pids(tr); + clear_ftrace_pids(tr, type); ftrace_update_pid_func(); ftrace_startup_all(0); @@ -7066,9 +7092,45 @@ static const struct seq_operations ftrace_pid_sops = { .show = fpid_show, }; -static int -ftrace_pid_open(struct inode *inode, struct file *file) +static void *fnpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); +} + +static void *fnpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (v == FTRACE_NO_PIDS) { + (*pos)++; + return NULL; + } + return trace_pid_next(pid_list, v, pos); +} + +static const struct seq_operations ftrace_no_pid_sops = { + .start = fnpid_start, + .next = fnpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int pid_open(struct inode *inode, struct file *file, int type) +{ + const struct seq_operations *seq_ops; struct trace_array *tr = inode->i_private; struct seq_file *m; int ret = 0; @@ -7079,9 +7141,18 @@ ftrace_pid_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(tr); + ftrace_pid_reset(tr, type); + + switch (type) { + case TRACE_PIDS: + seq_ops = &ftrace_pid_sops; + break; + case TRACE_NO_PIDS: + seq_ops = &ftrace_no_pid_sops; + break; + } - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, seq_ops); if (ret < 0) { trace_array_put(tr); } else { @@ -7093,10 +7164,23 @@ ftrace_pid_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_PIDS); +} + +static int +ftrace_no_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_NO_PIDS); +} + static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the @@ -7104,18 +7188,25 @@ static void ignore_task_cpu(void *data) */ pid_list = rcu_dereference_protected(tr->function_pids, mutex_is_locked(&ftrace_lock)); + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + mutex_is_locked(&ftrace_lock)); - this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, - trace_ignore_this_task(pid_list, current)); + if (trace_ignore_this_task(pid_list, no_pid_list, current)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + current->pid); } static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; - struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *filtered_pids; + struct trace_pid_list *other_pids; struct trace_pid_list *pid_list; ssize_t ret; @@ -7124,19 +7215,39 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, mutex_lock(&ftrace_lock); - filtered_pids = rcu_dereference_protected(tr->function_pids, + switch (type) { + case TRACE_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + break; + case TRACE_NO_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); + break; + } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; - rcu_assign_pointer(tr->function_pids, pid_list); + switch (type) { + case TRACE_PIDS: + rcu_assign_pointer(tr->function_pids, pid_list); + break; + case TRACE_NO_PIDS: + rcu_assign_pointer(tr->function_no_pids, pid_list); + break; + } + if (filtered_pids) { synchronize_rcu(); trace_free_pid_list(filtered_pids); - } else if (pid_list) { + } else if (pid_list && !other_pids) { /* Register a probe to set whether to ignore the tracing of a task */ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); } @@ -7159,6 +7270,20 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return ret; } +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_no_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + static int ftrace_pid_release(struct inode *inode, struct file *file) { @@ -7177,10 +7302,20 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; +static const struct file_operations ftrace_no_pid_fops = { + .open = ftrace_no_pid_open, + .write = ftrace_no_pid_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = ftrace_pid_release, +}; + void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { trace_create_file("set_ftrace_pid", 0644, d_tracer, tr, &ftrace_pid_fops); + trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, + tr, &ftrace_no_pid_fops); } void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61f0e92ace99..6f0b42ceeb00 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -441,6 +441,7 @@ enum { struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; + atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -484,7 +485,6 @@ struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; - atomic_t resize_disabled; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -503,10 +503,14 @@ struct trace_buffer { struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; + unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; u64 read_stamp; + u64 page_stamp; + struct ring_buffer_event *event; + int missed_events; }; /** @@ -1737,18 +1741,24 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, size = nr_pages * BUF_PAGE_SIZE; - /* - * Don't succeed if resizing is disabled, as a reader might be - * manipulating the ring buffer and is expecting a sane state while - * this is true. - */ - if (atomic_read(&buffer->resize_disabled)) - return -EBUSY; - /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + } + /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -1816,6 +1826,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages == cpu_buffer->nr_pages) goto out; + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; @@ -1885,6 +1905,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, free_buffer_page(bpage); } } + out_err_unlock: mutex_unlock(&buffer->mutex); return err; } @@ -1913,15 +1934,63 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline struct ring_buffer_event * -rb_iter_head_event(struct ring_buffer_iter *iter) +static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) { - return __rb_page_index(iter->head_page, iter->head); + return local_read(&bpage->page->commit); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) +static struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) { - return local_read(&bpage->page->commit); + struct ring_buffer_event *event; + struct buffer_page *iter_head_page = iter->head_page; + unsigned long commit; + unsigned length; + + if (iter->head != iter->next_event) + return iter->event; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. + * (see rb_tail_page_update() and __rb_reserve_next()) + */ + commit = rb_page_commit(iter_head_page); + smp_rmb(); + event = __rb_page_index(iter_head_page, iter->head); + length = rb_event_length(event); + + /* + * READ_ONCE() doesn't work on functions and we don't want the + * compiler doing any crazy optimizations with length. + */ + barrier(); + + if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE) + /* Writer corrupted the read? */ + goto reset; + + memcpy(iter->event, event, length); + /* + * If the page stamp is still the same after this rmb() then the + * event was safely copied without the writer entering the page. + */ + smp_rmb(); + + /* Make sure the page didn't change since we read this */ + if (iter->page_stamp != iter_head_page->page->time_stamp || + commit > rb_page_commit(iter_head_page)) + goto reset; + + iter->next_event = iter->head + length; + return iter->event; + reset: + /* Reset to the beginning */ + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; + iter->next_event = 0; + iter->missed_events = 1; + return NULL; } /* Size is determined by what has been committed */ @@ -1959,8 +2028,9 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->page->time_stamp; + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; + iter->next_event = 0; } /* @@ -3547,14 +3617,18 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; + iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; - if (iter->head) + if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; - else + iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; + } else { iter->read_stamp = iter->head_page->page->time_stamp; + iter->page_stamp = iter->read_stamp; + } } /** @@ -3590,17 +3664,38 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; + struct buffer_page *curr_commit_page; unsigned commit; + u64 curr_commit_ts; + u64 commit_ts; cpu_buffer = iter->cpu_buffer; - - /* Remember, trace recording is off when iterator is in use */ reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = cpu_buffer->commit_page; + commit_ts = commit_page->page->time_stamp; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. + * (see rb_tail_page_update()) + */ + smp_rmb(); commit = rb_page_commit(commit_page); + /* We want to make sure that the commit page doesn't change */ + smp_rmb(); - return ((iter->head_page == commit_page && iter->head == commit) || + /* Make sure commit page didn't change */ + curr_commit_page = READ_ONCE(cpu_buffer->commit_page); + curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); + + /* If the commit page changed, then there's more data */ + if (curr_commit_page != commit_page || + curr_commit_ts != commit_ts) + return 0; + + /* Still racy, as it may return a false positive, but that's OK */ + return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_commit(cpu_buffer->reader_page))); @@ -3828,15 +3923,22 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - unsigned length; cpu_buffer = iter->cpu_buffer; + /* If head == next_event then we need to jump to the next event */ + if (iter->head == iter->next_event) { + /* If the event gets overwritten again, there's nothing to do */ + if (rb_iter_head_event(iter) == NULL) + return; + } + + iter->head = iter->next_event; + /* * Check if we are at the end of the buffer. */ - if (iter->head >= rb_page_size(iter->head_page)) { + if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; @@ -3844,27 +3946,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) return; } - event = rb_iter_head_event(iter); - - length = rb_event_length(event); - - /* - * This should not be called to advance the header if we are - * at the tail of the buffer. - */ - if (RB_WARN_ON(cpu_buffer, - (iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer)))) - return; - - rb_update_iter_read_stamp(iter, event); - - iter->head += length; - - /* check for end of page padding */ - if ((iter->head >= rb_page_size(iter->head_page)) && - (iter->head_page != cpu_buffer->commit_page)) - rb_inc_iter(iter); + rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) @@ -3952,6 +4034,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; + bool failed = false; if (ts) *ts = 0; @@ -3978,10 +4061,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * to a data event, we should never loop more than three times. * Once for going to next page, once on time extend, and * finally once to get the event. - * (We never hit the following condition more than thrice). + * We should never hit the following condition more than thrice, + * unless the buffer is very small, and there's a writer + * that is causing the reader to fail getting an event. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) + if (++nr_loops > 3) { + RB_WARN_ON(cpu_buffer, !failed); return NULL; + } if (rb_per_cpu_empty(cpu_buffer)) return NULL; @@ -3992,6 +4079,10 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } event = rb_iter_head_event(iter); + if (!event) { + failed = true; + goto again; + } switch (event->type_len) { case RINGBUF_TYPE_PADDING: @@ -4102,6 +4193,20 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, return event; } +/** ring_buffer_iter_dropped - report if there are dropped events + * @iter: The ring buffer iterator + * + * Returns true if there was dropped events since the last peek. + */ +bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) +{ + bool ret = iter->missed_events != 0; + + iter->missed_events = 0; + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); + /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator @@ -4208,16 +4313,21 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), flags); + iter = kzalloc(sizeof(*iter), flags); if (!iter) return NULL; + iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags); + if (!iter->event) { + kfree(iter); + return NULL; + } + cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; - atomic_inc(&buffer->resize_disabled); - atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); return iter; } @@ -4290,42 +4400,31 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->buffer->resize_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** - * ring_buffer_read - read the next item in the ring buffer by the iterator + * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator - * @ts: The time stamp of the event read. * - * This reads the next event in the ring buffer and increments the iterator. + * Move the location of the iterator such that the next read will + * be the next location of the iterator. */ -struct ring_buffer_event * -ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { - struct ring_buffer_event *event; struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - again: - event = rb_iter_peek(iter, ts); - if (!event) - goto out; - - if (event->type_len == RINGBUF_TYPE_PADDING) - goto again; rb_advance_iter(iter); - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - return event; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } -EXPORT_SYMBOL_GPL(ring_buffer_read); +EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) @@ -4406,7 +4505,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; - atomic_inc(&buffer->resize_disabled); + atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ @@ -4427,7 +4526,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&buffer->resize_disabled); + atomic_dec(&cpu_buffer->resize_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6b11e4e2150c..8d2b98812625 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -386,16 +386,22 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) * Returns false if @task should be traced. */ bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) { /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. + * If filterd_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. */ - if (!filtered_pids) - return false; - return !trace_find_filtered_pid(filtered_pids, task->pid); + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** @@ -3378,7 +3384,7 @@ static void trace_iterator_increment(struct trace_iterator *iter) iter->idx++; if (buf_iter) - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); } static struct trace_entry * @@ -3388,11 +3394,15 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); - if (buf_iter) + if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); - else + if (lost_events) + *lost_events = ring_buffer_iter_dropped(buf_iter) ? + (unsigned long)-1 : 0; + } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); + } if (event) { iter->ent_size = ring_buffer_event_length(event); @@ -3462,11 +3472,51 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, return next; } +#define STATIC_TEMP_BUF_SIZE 128 +static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; + /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* __find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; + struct trace_entry *entry; + + /* + * If called from ftrace_dump(), then the iter->temp buffer + * will be the static_temp_buf and not created from kmalloc. + * If the entry size is greater than the buffer, we can + * not save it. Just return NULL in that case. This is only + * used to add markers when two consecutive events' time + * stamps have a large delta. See trace_print_lat_context() + */ + if (iter->temp == static_temp_buf && + STATIC_TEMP_BUF_SIZE < ent_size) + return NULL; + + /* + * The __find_next_entry() may call peek_next_entry(), which may + * call ring_buffer_peek() that may make the contents of iter->ent + * undefined. Need to copy iter->ent now. + */ + if (iter->ent && iter->ent != iter->temp) { + if ((!iter->temp || iter->temp_size < iter->ent_size) && + !WARN_ON_ONCE(iter->temp == static_temp_buf)) { + kfree(iter->temp); + iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!iter->temp) + return NULL; + } + memcpy(iter->temp, iter->ent, iter->ent_size); + iter->temp_size = iter->ent_size; + iter->ent = iter->temp; + } + entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* Put back the original ent_size */ + iter->ent_size = ent_size; + + return entry; } /* Find the next real entry, and increment the iterator to the next entry */ @@ -3538,7 +3588,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) if (ts >= iter->array_buffer->time_start) break; entries++; - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; @@ -3981,8 +4031,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) enum print_line_t ret; if (iter->lost_events) { - trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events); + if (iter->lost_events == (unsigned long)-1) + trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", + iter->cpu); + else + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); if (trace_seq_has_overflowed(&iter->seq)) return TRACE_TYPE_PARTIAL_LINE; } @@ -4198,6 +4252,18 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) goto release; /* + * trace_find_next_entry() may need to save off iter->ent. + * It will place it into the iter->temp buffer. As most + * events are less than 128, allocate a buffer of that size. + * If one is greater, then trace_find_next_entry() will + * allocate a new buffer to adjust for the bigger iter->ent. + * It's not critical if it fails to get allocated here. + */ + iter->temp = kmalloc(128, GFP_KERNEL); + if (iter->temp) + iter->temp_size = 128; + + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. */ @@ -4237,8 +4303,11 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping if we are not opening "snapshot" */ - if (!iter->snapshot) + /* + * If pause-on-trace is enabled, then stop the trace while + * dumping, unless this is the "snapshot" file + */ + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { @@ -4269,6 +4338,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->temp); kfree(iter->buffer_iter); release: seq_release_private(inode, file); @@ -4334,7 +4404,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot) + if (!iter->snapshot && tr->stop_count) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -4344,6 +4414,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); + kfree(iter->temp); kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); @@ -4964,6 +5035,8 @@ static const char readme_msg[] = #ifdef CONFIG_FUNCTION_TRACER " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" "\t\t (function)\n" + " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" + "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" @@ -9146,6 +9219,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); + /* Can not use kmalloc for iter.temp */ + iter.temp = static_temp_buf; + iter.temp_size = STATIC_TEMP_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -9334,7 +9410,7 @@ __init static int tracer_alloc_buffers(void) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ - if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 99372dd7d168..4eb1d004d5f2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -178,10 +178,10 @@ struct trace_array_cpu { kuid_t uid; char comm[TASK_COMM_LEN]; - bool ignore_pid; #ifdef CONFIG_FUNCTION_TRACER - bool ftrace_ignore_pid; + int ftrace_ignore_pid; #endif + bool ignore_pid; }; struct tracer; @@ -207,6 +207,30 @@ struct trace_pid_list { unsigned long *pids; }; +enum { + TRACE_PIDS = BIT(0), + TRACE_NO_PIDS = BIT(1), +}; + +static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* Return true if the pid list in type has pids */ + return ((type & TRACE_PIDS) && pid_list) || + ((type & TRACE_NO_PIDS) && no_pid_list); +} + +static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* + * Turning off what is in @type, return true if the "other" + * pid list, still has pids in it. + */ + return (!(type & TRACE_PIDS) && pid_list) || + (!(type & TRACE_NO_PIDS) && no_pid_list); +} + typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); /** @@ -285,6 +309,7 @@ struct trace_array { #endif #endif struct trace_pid_list __rcu *filtered_pids; + struct trace_pid_list __rcu *filtered_no_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are @@ -331,6 +356,7 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; + struct trace_pid_list __rcu *function_no_pids; #ifdef CONFIG_DYNAMIC_FTRACE /* All of these are protected by the ftrace_lock */ struct list_head func_probes; @@ -557,12 +583,7 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_BUFFER_BIT, - TRACE_BUFFER_NMI_BIT, - TRACE_BUFFER_IRQ_BIT, - TRACE_BUFFER_SIRQ_BIT, - - /* Start of function recursion bits */ + /* Function recursion bits */ TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, @@ -787,6 +808,7 @@ extern int pid_max; bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, struct task_struct *task); void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, @@ -1307,6 +1329,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ + C(PAUSE_ON_TRACE, "pause-on-trace"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index f22746f3c132..a523da0dae0a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -325,14 +325,16 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field_desc( long, timestamp, tv_nsec ) __field( unsigned int, nmi_count ) __field( unsigned int, seqnum ) + __field( unsigned int, count ) ), - F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, __entry->duration, __entry->outer_duration, + __entry->count, __entry->nmi_total_ts, __entry->nmi_count) ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f38234ecea18..242f59e7f17d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -232,10 +232,13 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_array_cpu *data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); - if (!pid_list) + no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); + + if (!pid_list && !no_pid_list) return false; data = this_cpu_ptr(tr->array_buffer.data); @@ -510,6 +513,9 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_raw(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -522,6 +528,9 @@ event_filter_pid_sched_process_fork(void *data, pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -544,13 +553,23 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; + bool ret; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); - this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, prev) && - trace_ignore_this_task(pid_list, next)); + /* + * Sched switch is funny, as we only want to ignore it + * in the notrace case if both prev and next should be ignored. + */ + ret = trace_ignore_this_task(NULL, no_pid_list, prev) && + trace_ignore_this_task(NULL, no_pid_list, next); + + this_cpu_write(tr->array_buffer.data->ignore_pid, ret || + (trace_ignore_this_task(pid_list, NULL, prev) && + trace_ignore_this_task(pid_list, NULL, next))); } static void @@ -558,18 +577,21 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, no_pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ @@ -577,15 +599,17 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) return; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, task)); + trace_ignore_this_task(pid_list, no_pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ @@ -593,23 +617,15 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) return; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, no_pid_list, current)); } -static void __ftrace_clear_event_pids(struct trace_array *tr) +static void unregister_pid_events(struct trace_array *tr) { - struct trace_pid_list *pid_list; - struct trace_event_file *file; - int cpu; - - pid_list = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (!pid_list) - return; - unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); @@ -621,26 +637,55 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); +} - list_for_each_entry(file, &tr->events, list) { - clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); +static void __ftrace_clear_event_pids(struct trace_array *tr, int type) +{ + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + struct trace_event_file *file; + int cpu; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) + return; + + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_pid_events(tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; } - for_each_possible_cpu(cpu) - per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, NULL); - rcu_assign_pointer(tr->filtered_pids, NULL); + if (type & TRACE_NO_PIDS) + rcu_assign_pointer(tr->filtered_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); - trace_free_pid_list(pid_list); + if ((type & TRACE_PIDS) && pid_list) + trace_free_pid_list(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_free_pid_list(no_pid_list); } -static void ftrace_clear_event_pids(struct trace_array *tr) +static void ftrace_clear_event_pids(struct trace_array *tr, int type) { mutex_lock(&event_mutex); - __ftrace_clear_event_pids(tr); + __ftrace_clear_event_pids(tr, type); mutex_unlock(&event_mutex); } @@ -1013,15 +1058,32 @@ static void t_stop(struct seq_file *m, void *p) } static void * -p_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, void *v, loff_t *pos, int type) { struct trace_array *tr = m->private; - struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + struct trace_pid_list *pid_list; + + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); return trace_pid_next(pid_list, v, pos); } -static void *p_start(struct seq_file *m, loff_t *pos) +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_PIDS); +} + +static void * +np_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_NO_PIDS); +} + +static void *__start(struct seq_file *m, loff_t *pos, int type) __acquires(RCU) { struct trace_pid_list *pid_list; @@ -1036,7 +1098,10 @@ static void *p_start(struct seq_file *m, loff_t *pos) mutex_lock(&event_mutex); rcu_read_lock_sched(); - pid_list = rcu_dereference_sched(tr->filtered_pids); + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); if (!pid_list) return NULL; @@ -1044,6 +1109,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) return trace_pid_start(pid_list, pos); } +static void *p_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_PIDS); +} + +static void *np_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_NO_PIDS); +} + static void p_stop(struct seq_file *m, void *p) __releases(RCU) { @@ -1588,6 +1665,7 @@ static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the @@ -1595,18 +1673,50 @@ static void ignore_task_cpu(void *data) */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + mutex_is_locked(&event_mutex)); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, no_pid_list, current)); +} + +static void register_pid_events(struct trace_array *tr) +{ + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } static ssize_t -ftrace_event_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *other_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; @@ -1620,14 +1730,26 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, mutex_lock(&event_mutex); - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); + if (type == TRACE_PIDS) { + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + } else { + filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; - rcu_assign_pointer(tr->filtered_pids, pid_list); + if (type == TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, pid_list); + else + rcu_assign_pointer(tr->filtered_no_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); @@ -1636,32 +1758,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); - } else if (pid_list) { - /* - * Register a probe that is called before all other probes - * to set ignore_pid if next or prev do not match. - * Register a probe this is called after all other probes - * to only keep ignore_pid set if next pid matches. - */ - register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, - tr, 0); - - register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, - tr, 0); - - register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, - tr, 0); - - register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, - tr, 0); + } else if (pid_list && !other_pids) { + register_pid_events(tr); } /* @@ -1680,9 +1778,24 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, return ret; } +static ssize_t +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_event_npid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); +static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { @@ -1706,6 +1819,13 @@ static const struct seq_operations show_set_pid_seq_ops = { .stop = p_stop, }; +static const struct seq_operations show_set_no_pid_seq_ops = { + .start = np_start, + .next = np_next, + .show = trace_pid_show, + .stop = p_stop, +}; + static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, @@ -1729,6 +1849,14 @@ static const struct file_operations ftrace_set_event_pid_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_set_event_notrace_pid_fops = { + .open = ftrace_event_set_npid_open, + .read = seq_read, + .write = ftrace_event_npid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, @@ -1858,7 +1986,28 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_event_pids(tr); + ftrace_clear_event_pids(tr, TRACE_PIDS); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int +ftrace_event_set_npid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr, TRACE_NO_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) @@ -3075,6 +3224,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, + tr, &ftrace_set_event_notrace_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + /* ring buffer internal formats */ entry = trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, @@ -3158,7 +3312,7 @@ int event_trace_del_tracer(struct trace_array *tr) clear_event_triggers(tr); /* Clear the pid list */ - __ftrace_clear_event_pids(tr); + __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index dd34a1b46a86..3a74736da363 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1088,14 +1088,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = register_trigger(glob, ops, data, file); - - if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) { - unregister_trigger(glob, ops, data, file); - ret = 0; - } + if (tracing_alloc_snapshot_instance(file->tr) != 0) + return 0; - return ret; + return register_trigger(glob, ops, data, file); } static int diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7d71546ba00a..4a9c49c08ec9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -482,7 +482,7 @@ get_return_for_leaf(struct trace_iterator *iter, /* this is a leaf, now advance the iterator */ if (ring_iter) - ring_buffer_read(ring_iter, NULL); + ring_buffer_iter_advance(ring_iter); return next; } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a48808c43249..e2be7bb7ef7e 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,6 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ + int count; /* # of iteratons over threash */ }; /* keep the global state somewhere. */ @@ -124,6 +125,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->timestamp = sample->timestamp; entry->nmi_total_ts = sample->nmi_total_ts; entry->nmi_count = sample->nmi_count; + entry->count = sample->count; if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit_nostack(buffer, event); @@ -167,12 +169,14 @@ void trace_hwlat_callback(bool enter) static int get_sample(void) { struct trace_array *tr = hwlat_trace; + struct hwlat_sample s; time_type start, t1, t2, last_t2; - s64 diff, total, last_total = 0; + s64 diff, outer_diff, total, last_total = 0; u64 sample = 0; u64 thresh = tracing_thresh; u64 outer_sample = 0; int ret = -1; + unsigned int count = 0; do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ @@ -186,6 +190,7 @@ static int get_sample(void) init_time(last_t2, 0); start = time_get(); /* start timestamp */ + outer_diff = 0; do { @@ -194,14 +199,14 @@ static int get_sample(void) if (time_u64(last_t2)) { /* Check the delta from outer loop (t2 to next t1) */ - diff = time_to_us(time_sub(t1, last_t2)); + outer_diff = time_to_us(time_sub(t1, last_t2)); /* This shouldn't happen */ - if (diff < 0) { + if (outer_diff < 0) { pr_err(BANNER "time running backwards\n"); goto out; } - if (diff > outer_sample) - outer_sample = diff; + if (outer_diff > outer_sample) + outer_sample = outer_diff; } last_t2 = t2; @@ -217,6 +222,12 @@ static int get_sample(void) /* This checks the inner loop (t1 to t2) */ diff = time_to_us(time_sub(t2, t1)); /* current diff */ + if (diff > thresh || outer_diff > thresh) { + if (!count) + ktime_get_real_ts64(&s.timestamp); + count++; + } + /* This shouldn't happen */ if (diff < 0) { pr_err(BANNER "time running backwards\n"); @@ -236,7 +247,6 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { - struct hwlat_sample s; u64 latency; ret = 1; @@ -249,9 +259,9 @@ static int get_sample(void) s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - ktime_get_real_ts64(&s.timestamp); s.nmi_total_ts = nmi_total_ts; s.nmi_count = nmi_count; + s.count = count; trace_hwlat_sample(&s); latency = max(sample, outer_sample); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 362cca52f5de..d0568af4a0ef 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1078,6 +1078,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); + if (trace_kprobe_is_return(tk) && tk->rp.maxactive) + seq_printf(m, "%d", tk->rp.maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp), trace_probe_name(&tk->tp)); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b4909082f6a4..9a121e147102 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -617,22 +617,19 @@ int trace_print_context(struct trace_iterator *iter) int trace_print_lat_context(struct trace_iterator *iter) { + struct trace_entry *entry, *next_entry; struct trace_array *tr = iter->tr; - /* trace_find_next_entry will reset ent_size */ - int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; - u64 next_ts; - struct trace_entry *entry = iter->ent, - *next_entry = trace_find_next_entry(iter, NULL, - &next_ts); unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + u64 next_ts; - /* Restore the original ent_size */ - iter->ent_size = ent_size; - + next_entry = trace_find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; + /* trace_find_next_entry() may change iter->ent */ + entry = iter->ent; + if (verbose) { char comm[TASK_COMM_LEN]; @@ -1158,12 +1155,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", field->seqnum, field->duration, field->outer_duration, (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec); + field->timestamp.tv_nsec, field->count); if (field->nmi_count) { /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 18d16f3ef980..2a8e8e9c1c75 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + u32 ret; + + preempt_disable(); + ret = trace_call_bpf(call, regs); + preempt_enable(); + if (!ret) + return; + } esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); diff --git a/kernel/ucount.c b/kernel/ucount.c index a53cc2b4179c..11b1596e2542 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -69,6 +69,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), + UCOUNT_ENTRY("max_time_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), @@ -81,6 +82,8 @@ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; + + BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4e01c448b4b4..891ccad5f271 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -858,7 +858,8 @@ void wq_worker_running(struct task_struct *task) * @task: task going to sleep * * This function is called from schedule() when a busy worker is - * going to sleep. + * going to sleep. Preemption needs to be disabled to protect ->sleeping + * assignment. */ void wq_worker_sleeping(struct task_struct *task) { @@ -875,7 +876,8 @@ void wq_worker_sleeping(struct task_struct *task) pool = worker->pool; - if (WARN_ON_ONCE(worker->sleeping)) + /* Return if preempted before wq_worker_running() was reached */ + if (worker->sleeping) return; worker->sleeping = 1; @@ -2834,7 +2836,7 @@ void flush_workqueue(struct workqueue_struct *wq) * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. */ - if (wq->first_flusher != &this_flusher) + if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); @@ -2843,7 +2845,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) goto out_unlock; - wq->first_flusher = NULL; + WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); @@ -5898,7 +5900,7 @@ static void __init wq_numa_init(void) * items. Actual work item execution starts only after kthreads can be * created and scheduled right before early initcalls. */ -int __init workqueue_init_early(void) +void __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; @@ -5965,8 +5967,6 @@ int __init workqueue_init_early(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); - - return 0; } /** @@ -5978,7 +5978,7 @@ int __init workqueue_init_early(void) * are no kworkers executing the work items yet. Populate the worker pools * with the initial workers and enable future kworker creations. */ -int __init workqueue_init(void) +void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; @@ -6025,6 +6025,4 @@ int __init workqueue_init(void) wq_online = true; wq_watchdog_init(); - - return 0; } |