| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 11:35:36 -0800 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 11:35:36 -0800 |
| commit | 4ba24fef3eb3b142197135223b90ced2f319cd53 (patch) | |
| tree | a20c125b27740ec7b4c761b11d801108e1b316b2 /kernel | |
| parent | 47c1ffb2b6b630894e9a16442611c056ab21c057 (diff) | |
| parent | 98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff) | |
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'kernel')
171 files changed, 12976 insertions, 5148 deletions
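For reference, a minimal sketch of how this diffstat could be reproduced from a local kernel checkout. It assumes the stat shown above is taken against the merge's first parent (the usual behavior for this kind of commit view) and that both commit IDs from the header table are reachable in the clone; exact insertion/deletion counts can vary slightly with git version and rename-detection settings.

```sh
# Diff the merge commit against its first parent, limited to kernel/
# (commit IDs taken from the header table above).
git diff --stat 47c1ffb2b6b630894e9a16442611c056ab21c057 \
    4ba24fef3eb3b142197135223b90ced2f319cd53 -- kernel/
```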
diff --git a/kernel/Makefile b/kernel/Makefile index dc5c77544fd6..a59481a3fa6c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -86,7 +85,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/acct.c b/kernel/acct.c index b4c667d22e79..33738ef972f3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct) acct_t ac; unsigned long flim; const struct cred *orig_cred; - struct pid_namespace *ns = acct->ns; struct file *file = acct->file; /* @@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct) ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION == 3 - ac.ac_pid = task_tgid_nr_ns(current, ns); - rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); - rcu_read_unlock(); + { + struct pid_namespace *ns = acct->ns; + + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), + ns); + rcu_read_unlock(); + } #endif /* * Get freeze protection. If the fs is frozen, just skip the write diff --git a/kernel/async.c b/kernel/async.c index 61f023ce0228..4c3773c0bf63 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain ktime_t uninitialized_var(starttime), delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } @@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain endtime = ktime_get(); delta = ktime_sub(endtime, starttime); - printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), (long long)ktime_to_ns(delta) >> 10); } diff --git a/kernel/audit.c b/kernel/audit.c index ba2ff5a5c600..72ab759a0b43 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. 
*/ static struct sock *audit_sock; -int audit_net_id; +static int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb) * This function doesn't consume an skb as might be expected since it has to * copy it anyways. */ -static void kauditd_send_multicast_skb(struct sk_buff *skb) +static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *copy; struct audit_net *aunet = net_generic(&init_net, audit_net_id); @@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, GFP_KERNEL); + copy = skb_copy(skb, gfp_mask); if (!copy) return; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); } /* @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { struct sk_buff *skb; - DECLARE_WAITQUEUE(wait, current); flush_hold_queue(); @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) audit_printk_skb(skb); continue; } - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } @@ -724,7 +715,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } @@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); audit_log_task_info(ab, current); - audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", + audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); @@ -750,7 +741,7 @@ static int audit_set_feature(struct sk_buff *skb) struct audit_features *uaf; int i; - BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); - s.version = AUDIT_VERSION_LATEST; + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; @@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb) } /* Run custom bind function on netlink socket group connect or bind requests. 
*/ -static int audit_bind(int group) +static int audit_bind(struct net *net, int group) { if (!capable(CAP_AUDIT_READ)) return -EPERM; @@ -1301,19 +1292,9 @@ err: */ unsigned int audit_serial(void) { - static DEFINE_SPINLOCK(serial_lock); - static unsigned int serial = 0; - - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&serial_lock, flags); - do { - ret = ++serial; - } while (unlikely(!ret)); - spin_unlock_irqrestore(&serial_lock, flags); + static atomic_t serial = ATOMIC_INIT(0); - return ret; + return atomic_add_return(1, &serial); } static inline void audit_get_stamp(struct audit_context *ctx, @@ -1681,7 +1662,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) } } -void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { kernel_cap_t *perm = &name->fcap.permitted; kernel_cap_t *inh = &name->fcap.inheritable; @@ -1860,7 +1841,7 @@ EXPORT_SYMBOL(audit_log_task_context); void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; - char name[sizeof(tsk->comm)]; + char comm[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; char *tty; @@ -1894,9 +1875,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->fsgid), tty, audit_get_sessionid(tsk)); - get_task_comm(name, tsk); audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); + audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); if (mm) { down_read(&mm->mmap_sem); @@ -1959,7 +1939,8 @@ void audit_log_end(struct audit_buffer *ab) } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - kauditd_send_multicast_skb(ab->skb); + nlh->nlmsg_len = ab->skb->len; + kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); /* * The original kaudit unicast socket sends up messages with @@ -1970,7 +1951,7 @@ void audit_log_end(struct audit_buffer *ab) * protocol between the kaudit kernel subsystem and the auditd * userspace code. 
*/ - nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; + nlh->nlmsg_len -= NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); diff --git a/kernel/audit.h b/kernel/audit.h index 7bb65730c890..3cdffad5a1d9 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name, const struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); -extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); extern void audit_log_name(struct audit_context *context, struct audit_names *n, struct path *path, int record_num, int *call_panic); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 135944a7b28a..2e0c97427b33 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) chunk->owners[i].index = i; } fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -173,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) struct fsnotify_mark *entry = &chunk->mark; struct list_head *list; - if (!entry->i.inode) + if (!entry->inode) return; - list = chunk_hash(entry->i.inode); + list = chunk_hash(entry->inode); list_add_rcu(&chunk->hash, list); } @@ -187,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { + if (p->mark.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -230,7 +231,7 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { + if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); if (new) free_chunk(new); @@ -257,7 +258,7 @@ static void untag_chunk(struct node *p) goto Fallback; fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -385,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { + if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); fsnotify_put_mark(old_entry); @@ -394,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; @@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule) if (unlikely(!ab)) return; audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); + audit_log_string(ab, "remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); @@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree 
*tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_log_remove_rule(rule); + audit_tree_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); @@ -610,7 +611,7 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.i.inode; + struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 70b4554d2fbe..ad9c1682f616 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent, &nentry->rule.list); } - audit_watch_log_rule_change(r, owatch, "updated rules"); + audit_watch_log_rule_change(r, owatch, "updated_rules"); call_rcu(&oentry->rcu, audit_free_rule_rcu); } @@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); + audit_watch_log_rule_change(r, w, "remove_rule"); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c447cd9848d1..4f68a326d92e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); +static void audit_free_lsm_field(struct audit_field *f) +{ + switch (f->type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); + } +} + static inline void audit_free_rule(struct audit_entry *e) { int i; @@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (erule->watch) audit_put_watch(erule->watch); if (erule->fields) - for (i = 0; i < erule->field_count; i++) { - struct audit_field *f = &erule->fields[i]; - kfree(f->lsm_str); - security_audit_rule_free(f->lsm_rule); - } + for (i = 0; i < erule->field_count; i++) + audit_free_lsm_field(&erule->fields[i]); kfree(erule->fields); kfree(erule->filterkey); kfree(e); @@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree || + krule->inode_f || krule->watch || krule->tree || (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; @@ -422,28 +437,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - f->lsm_str = NULL; - f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; - } - - if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { - struct pid *pid; - rcu_read_lock(); - pid = find_vpid(f->val); - if (!pid) { - rcu_read_unlock(); - err = -ESRCH; - goto exit_free; - } - f->val 
= pid_nr(pid); - rcu_read_unlock(); + entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } err = audit_field_valid(entry, f); @@ -619,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_LOGINUID_SET: + if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { + data->fields[i] = AUDIT_LOGINUID; + data->values[i] = AUDIT_UID_UNSET; + break; + } + /* fallthrough if set */ default: data->values[i] = f->val; } @@ -635,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) int i; if (a->flags != b->flags || + a->pflags != b->pflags || a->listnr != b->listnr || a->action != b->action || a->field_count != b->field_count) @@ -753,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) new = &entry->rule; new->vers_ops = old->vers_ops; new->flags = old->flags; + new->pflags = old->pflags; new->listnr = old->listnr; new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) @@ -1053,30 +1061,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, int err = 0; struct audit_entry *entry; + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + switch (type) { case AUDIT_ADD_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_add_rule(entry); - audit_log_rule_change("add rule", &entry->rule, !err); - if (err) - audit_free_rule(entry); + audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_del_rule(entry); - audit_log_rule_change("remove rule", &entry->rule, !err); - audit_free_rule(entry); + audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - return -EINVAL; + err = -EINVAL; + WARN_ON(1); } + if (err || type == AUDIT_DEL_RULE) + audit_free_rule(entry); + return err; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 21eae3c05ec0..072566dd0caf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -67,10 +67,13 @@ #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> +#include <asm/syscall.h> #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> +#include <linux/string.h> +#include <uapi/linux/limits.h> #include "audit.h" @@ -125,14 +128,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -static inline int open_arg(int flags, int mask) -{ - int n = ACC_MODE(flags); - if (flags & (O_TRUNC | O_CREAT)) - n |= AUDIT_PERM_WRITE; - return n & mask; -} - static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -1505,7 +1500,6 @@ void __audit_free(struct task_struct *tsk) /** * audit_syscall_entry - fill in an audit record at syscall entry - * @arch: architecture type * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1520,9 +1514,8 @@ void __audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). 
*/ -void __audit_syscall_entry(int arch, int major, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4) +void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) { struct task_struct *tsk = current; struct audit_context *context = tsk->audit_context; @@ -1536,7 +1529,7 @@ void __audit_syscall_entry(int arch, int major, if (!audit_enabled) return; - context->arch = arch; + context->arch = syscall_get_arch(); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -1870,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - /* does the name pointer match? */ - if (!n->name || n->name->name != name->name) + if (!n->name || strcmp(n->name->name, name->name)) continue; /* match the correct record type */ @@ -1886,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } out_alloc: - /* unable to find the name from a previous getname(). Allocate a new - * anonymous entry. - */ - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); + /* unable to find an entry with both a matching name and type */ + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; + /* unfortunately, while we may have a path name to record with the + * inode, we can't always rely on the string lasting until the end of + * the syscall so we need to create our own copy, it may fail due to + * memory allocation issues, but we do our best */ + if (name) { + /* we can't use getname_kernel() due to size limits */ + size_t len = strlen(name->name) + 1; + struct filename *new = __getname(); + + if (unlikely(!new)) + goto out; + + if (len <= (PATH_MAX - sizeof(*new))) { + new->name = (char *)(new) + sizeof(*new); + new->separate = false; + } else if (len <= PATH_MAX) { + /* this looks odd, but is due to final_putname() */ + struct filename *new2; + + new2 = kmalloc(sizeof(*new2), GFP_KERNEL); + if (unlikely(!new2)) { + __putname(new); + goto out; + } + new2->name = (char *)new; + new2->separate = true; + new = new2; + } else { + /* we should never get here, but let's be safe */ + __putname(new); + goto out; + } + strlcpy((char *)new->name, name->name, len); + new->uptr = NULL; + new->aname = n; + n->name = new; + n->name_put = true; + } out: if (parent) { n->name_len = n->name ? 
parent_len(n->name->name) : AUDIT_NAME_FULL; @@ -1906,6 +1934,11 @@ out: audit_copy_inode(n, dentry, inode); } +void __audit_file(const struct file *file) +{ + __audit_inode(NULL, file->f_path.dentry, 0); +} + /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent @@ -2382,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - dentry = dget(bprm->file->f_dentry); + dentry = dget(bprm->file->f_path.dentry); get_vfs_caps_from_disk(dentry, &vcaps); dput(dentry); @@ -2406,7 +2439,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * @new: the new credentials * @old: the old (current) credentials * - * Record the aguments userspace sent to sys_capset for later printing by the + * Record the arguments userspace sent to sys_capset for later printing by the * audit system if applicable */ void __audit_log_capset(const struct cred *new, const struct cred *old) @@ -2433,6 +2466,7 @@ static void audit_log_task(struct audit_buffer *ab) kgid_t gid; unsigned int sessionid; struct mm_struct *mm = current->mm; + char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); @@ -2445,7 +2479,7 @@ static void audit_log_task(struct audit_buffer *ab) sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); - audit_log_untrustedstring(ab, current->comm); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); if (mm) { down_read(&mm->mmap_sem); if (mm->exe_file) @@ -2488,11 +2522,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) if (unlikely(!ab)) return; audit_log_task(ab); - audit_log_format(ab, " sig=%ld", signr); - audit_log_format(ab, " syscall=%ld", syscall); - audit_log_format(ab, " compat=%d", is_compat_task()); - audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); - audit_log_format(ab, " code=0x%x", code); + audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", + signr, syscall_get_arch(), syscall, is_compat_task(), + KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..a5ae60f0b0a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1,5 @@ obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o +ifdef CONFIG_TEST_BPF +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 000000000000..9eb4d8a7cd87 --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,156 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/mm.h> + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size, array_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* check round_up into zero and u32 overflow */ + if (elem_size == 0 || + attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + return ERR_PTR(-ENOMEM); + + array_size = sizeof(*array) + attr->max_entries * elem_size; + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elements already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,9 +20,14 @@ * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ + #include <linux/filter.h> #include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/moduleloader.h> #include <asm/unaligned.h> +#include <linux/bpf.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog_aux *aux; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->aux = aux; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->aux from fp_old around in the new + * reallocated structure. + */ + fp_old->aux = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->aux); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + +#ifdef CONFIG_BPF_JIT +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. 
*/ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_free(NULL, hdr); +} +#endif /* CONFIG_BPF_JIT */ + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -239,6 +344,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; @@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_prog_aux *aux; + + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_prog_aux *aux = fp->aux; + + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000000..b3ba43674310 --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,367 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. 
+ * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + /* prevent zero size kmalloc and check for u32 overflow */ + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + goto free_htab; + + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 hash, key_size; + + /* Must be called with rcu_read_lock. 
*/ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall 
or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000000..9e3414d85459 --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,89 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/rcupdate.h> + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. 
+ */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..088ac0b1b106 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,606 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/bpf.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/license.h> +#include <linux/filter.h> + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +/* if error is returned, fd is released. 
+ * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOENT; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); 
+ void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* fixup insn->imm field of bpf_call instructions: + * if (insn->imm == BPF_FUNC_map_lookup_elem) + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; + * else if (insn->imm == BPF_FUNC_map_update_elem) + * insn->imm = bpf_map_update_elem - __bpf_call_base; + * else ... + * + * this function is called after eBPF program passed verification + */ +static void fixup_bpf_calls(struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + int i; + + for (i = 0; i < prog->len; i++) { + struct bpf_insn *insn = &prog->insnsi[i]; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + /* we reach here when program has bpf_call instructions + * and it passed bpf_check(), means that + * ops->get_func_proto must have been supplied, check it + */ + BUG_ON(!prog->aux->ops->get_func_proto); + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + BUG_ON(!fn->func); + insn->imm = fn->func - __bpf_call_base; + } + } +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if 
(IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD log_buf + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + err = bpf_check(prog, attr); + + if (err < 0) + goto free_used_maps; + + /* fixup BPF_CALL->imm field */ + fixup_bpf_calls(prog); + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. 
+ */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c new file mode 100644 index 000000000000..0ceae1e6e8b5 --- /dev/null +++ b/kernel/bpf/test_stub.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/bpf.h> + +/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC + * to be used by user space verifier testsuite + */ +struct bpf_context { + u64 arg1; + u64 arg2; +}; + +static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static const struct bpf_context_access { + int size; + enum bpf_access_type type; +} test_ctx_access[] = { + [offsetof(struct bpf_context, arg1)] = { + FIELD_SIZEOF(struct bpf_context, arg1), + BPF_READ + }, + [offsetof(struct bpf_context, arg2)] = { + FIELD_SIZEOF(struct bpf_context, arg2), + BPF_READ + }, +}; + +static bool test_is_valid_access(int off, int size, enum bpf_access_type type) +{ + const struct bpf_context_access *access; + + if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) + return false; + + access = &test_ctx_access[off]; + if (access->size == size && (access->type & type)) + return true; + + return false; +} + +static struct bpf_verifier_ops test_ops = { + .get_func_proto = test_func_proto, + .is_valid_access = test_is_valid_access, +}; + +static struct bpf_prog_type_list tl_prog = { + .ops = &test_ops, + .type = BPF_PROG_TYPE_UNSPEC, +}; + +static int __init register_test_ops(void) +{ + bpf_register_prog_type(&tl_prog); + return 0; +} +late_initcall(register_test_ops); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..a28e09c7825d --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,2003 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/netlink.h> +#include <linux/file.h> +#include <linux/vmalloc.h> + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all paths through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less than 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one such argument constraint.
+ * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example, the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns either a pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable.
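(Illustrative continuation, not part of this patch, assuming the map's value_size is at least 8 bytes: after the call above a program normally tests the result before touching it:
 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* R0 is PTR_TO_MAP_VALUE_OR_NULL; jump over the store when it is NULL */
 BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), /* fall-through: R0 is now PTR_TO_MAP_VALUE, so check_mem_access() permits the write */
 BPF_MOV64_IMM(BPF_REG_0, 0),
 BPF_EXIT_INSN(),
Without the NULL test R0 keeps type PTR_TO_MAP_VALUE_OR_NULL and the store is rejected.)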
+ */ + +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; +}; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifer state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ + struct verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) 
+{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + } + verbose("\n"); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int 
caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand can be read */ + if (regs[regno].type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + /* save register state */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + state->regs[value_regno]; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { + /* regular write of data into stack */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + u8 *slot_type; + int i; + + slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + + if (slot_type[0] == STACK_SPILL) { + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < BPF_REG_SIZE; i++) { + if (slot_type[i] != STACK_SPILL) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; + return 0; + } else { + 
for (i = 0; i < size; i++) { + if (slot_type[i] != STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, insn->off, + 
BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_ANYTHING) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. 
Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return 
err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = regs[insn->src_reg]; + } else { + regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) 
== BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch targer cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + 
err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + +/* non-recursive DFS pseudo code + * 1 procedure DFS-iterative(G,v): + * 2 label v as discovered + * 3 let S be a stack + * 4 S.push(v) + * 5 while S is not empty + * 6 t <- S.pop() + * 7 if t is what we're looking for: + * 8 return t + * 9 for all edges e in G.adjacentEdges(t) do + * 10 if edge e is already labelled + * 11 continue with the next edge + * 12 w <- G.adjacentVertex(t,e) + * 13 if vertex w is not discovered and not explored + * 14 label e as tree-edge + * 15 label w as discovered + * 16 S.push(w) + * 17 continue at 5 + * 18 else if vertex w is discovered + * 19 label e as back-edge + * 20 else + * 21 // vertex w is explored + * 22 label e as forward- or cross-edge + * 23 label t as explored + * 24 S.pop() + * + * convention: + * 0x10 - discovered + * 0x11 - discovered and fall-through edge labelled + * 0x12 - discovered and fall-through and branch edges labelled + * 0x20 - explored + */ + +enum { + DISCOVERED = 0x10, + EXPLORED = 0x20, + FALLTHROUGH = 1, + BRANCH = 2, +}; + +#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) + +static int *insn_stack; /* stack of insns to process */ +static int cur_stack; /* current stack index */ +static int *insn_state; + +/* t, w, e - match pseudo-code above: + * t - index of current instruction + * w - next instruction + * e - edge + */ +static int push_insn(int t, int w, int e, struct verifier_env *env) +{ + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) + return 0; + + if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) + return 0; + + if (w < 0 || w >= env->prog->len) { + verbose("jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + if (e == BRANCH) + /* mark branch target for state pruning */ + env->explored_states[w] = STATE_LIST_MARK; + + if (insn_state[w] == 0) { + /* tree-edge */ + insn_state[t] = DISCOVERED | e; + insn_state[w] = DISCOVERED; + if (cur_stack >= env->prog->len) + return -E2BIG; + insn_stack[cur_stack++] = w; + return 1; + } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose("back-edge from insn %d to %d\n", t, w); + return -EINVAL; + } else if (insn_state[w] == EXPLORED) { + /* forward- or cross-edge */ + insn_state[t] = DISCOVERED | e; + } else { + verbose("insn state internal bug\n"); + return -EFAULT; + } + return 0; +} + +/* non-recursive depth-first-search to detect loops in BPF program + * loop == back-edge in directed graph + */ +static int check_cfg(struct verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret = 0; + int i, t; + + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_state) + return -ENOMEM; + + insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_stack) { + kfree(insn_state); + return -ENOMEM; + } + + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ + insn_stack[0] = 0; /* 0 is the first instruction */ + cur_stack = 1; + +peek_stack: + if (cur_stack == 0) + goto check_state; + t = insn_stack[cur_stack - 1]; + + if (BPF_CLASS(insns[t].code) == BPF_JMP) { + u8 opcode = BPF_OP(insns[t].code); + + if (opcode == BPF_EXIT) 
{ + goto mark_explored; + } else if (opcode == BPF_CALL) { + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } else if (opcode == BPF_JA) { + if (BPF_SRC(insns[t].code) != BPF_K) { + ret = -EINVAL; + goto err_free; + } + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, + FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + /* tell verifier to check for equivalent states + * after every call and jump + */ + env->explored_states[t + 1] = STATE_LIST_MARK; + } else { + /* conditional jump with two edges */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + } else { + /* all other non-branch instructions with single + * fall-through edge + */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + +mark_explored: + insn_state[t] = EXPLORED; + if (cur_stack-- <= 0) { + verbose("pop stack internal bug\n"); + ret = -EFAULT; + goto err_free; + } + goto peek_stack; + +check_state: + for (i = 0; i < insn_cnt; i++) { + if (insn_state[i] != EXPLORED) { + verbose("unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + } + ret = 0; /* cfg looks good */ + +err_free: + kfree(insn_state); + kfree(insn_stack); + return ret; +} + +/* compare two verifier states + * + * all states stored in state_list are known to be valid, since + * verifier reached 'bpf_exit' instruction through them + * + * this function is called when verifier exploring different branches of + * execution popped from the state stack. If it sees an old state that has + * more strict register state and more strict stack state then this execution + * branch doesn't need to be explored further, since verifier already + * concluded that more strict state leads to valid finish. + * + * Therefore two states are equivalent if register state is more conservative + * and explored stack state is more conservative than the current one. + * Example: + * explored current + * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) + * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) + * + * In other words if current stack state (one being explored) has more + * valid slots than old one that already passed validation, it means + * the verifier can stop exploring and conclude that current state is valid too + * + * Similarly with registers. 
If explored state has register type as invalid + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (memcmp(&old->regs[i], &cur->regs[i], + sizeof(old->regs[0])) != 0) { + if (old->regs[i].type == NOT_INIT || + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) + continue; + return false; + } + } + + for (i = 0; i < MAX_BPF_STACK; i++) { + if (old->stack_slot_type[i] == STACK_INVALID) + continue; + if (old->stack_slot_type[i] != cur->stack_slot_type[i]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + sizeof(old->spilled_regs[0]))) + /* when explored and current stack slot types are + * the same, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * but current path has stored: + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + else + continue; + } + return true; +} + +static int is_state_visited(struct verifier_env *env, int insn_idx) +{ + struct verifier_state_list *new_sl; + struct verifier_state_list *sl; + + sl = env->explored_states[insn_idx]; + if (!sl) + /* this 'insn_idx' instruction wasn't marked, so we will not + * be doing state search here + */ + return 0; + + while (sl != STATE_LIST_MARK) { + if (states_equal(&sl->state, &env->cur_state)) + /* reached equivalent register/stack state, + * prune the search + */ + return 1; + sl = sl->next; + } + + /* there were no equivalent states, remember current one. + * technically the current state is not proven to be safe yet, + * but it will either reach bpf_exit (which means it's safe) or + * it will be rejected. Since there are no loops, we won't be + * seeing this 'insn_idx' instruction again on the way to bpf_exit + */ + new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + if (!new_sl) + return -ENOMEM; + + /* add new state to the head of linked list */ + memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + new_sl->next = env->explored_states[insn_idx]; + env->explored_states[insn_idx] = new_sl; + return 0; +} + +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt %d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. 
Proccessed %d insn\n", + insn_processed); + return -E2BIG; + } + + err = is_state_visited(env, insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (log_level) { + if (do_print_state) + verbose("\nfrom %d to %d: safe\n", + prev_insn_idx, insn_idx); + else + verbose("%d: safe\n", insn_idx); + } + goto process_bpf_exit; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. 
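(As an aside, an illustrative sketch rather than part of the patch: the smallest program that satisfies this check is simply BPF_MOV64_IMM(BPF_REG_0, 0) followed by BPF_EXIT_INSN(), since the move gives R0 a readable CONST_IMM state.)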
+ * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + +process_bpf_exit: + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. 
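(Concretely, as an illustrative sketch rather than part of the patch: user space emits the pseudo form with the helper from linux/filter.h,
 BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* two insns: BPF_LD | BPF_DW | BPF_IMM, src_reg = BPF_PSEUDO_MAP_FD, insn[0].imm = map_fd */
and after the rewrite above those two imm fields hold the low and high 32 bits of the struct bpf_map pointer, which is what ld_imm64_to_map_ptr() reassembles and what convert_pseudo_ld_imm64() later strips back into a plain 64-bit immediate load.)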
+ */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + +static void free_states(struct verifier_env *env) +{ + struct verifier_state_list *sl, *sln; + int i; + + if (!env->explored_states) + return; + + for (i = 0; i < env->prog->len; i++) { + sl = env->explored_states[i]; + + if (sl) + while (sl != STATE_LIST_MARK) { + sln = sl->next; + kfree(sl); + sl = sln; + } + } + + kfree(env->explored_states); +} + +int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +{ + char __user *log_ubuf = NULL; + struct verifier_env *env; + int ret = -EINVAL; + + if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->prog = prog; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + + env->explored_states = kcalloc(prog->len, + sizeof(struct verifier_state_list *), + GFP_USER); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. 
Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + if (!prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); + kfree(env); + mutex_unlock(&bpf_verifier_lock); + return ret; +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a73f995a81e..bb263d0caab3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, @@ -279,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Use ->child_subsys_mask. + */ while (cgroup_parent(cgrp) && !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); @@ -286,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). + */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -331,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) return false; } -static int cgroup_is_releasable(const struct cgroup *cgrp) -{ - const int bits = - (1 << CGRP_RELEASABLE) | - (1 << CGRP_NOTIFY_ON_RELEASE); - return (cgrp->flags & bits) == bits; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -394,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/* the list of cgroups eligible for automatic release. 
Protected by - * release_list_lock */ -static LIST_HEAD(release_list); -static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); -static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); /* @@ -498,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset, bool taskexit) +static void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -524,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links)) { cgroup_update_populated(cgrp, false); - if (notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } + check_for_release(cgrp); } kfree(link); @@ -537,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset, bool taskexit) +static void put_css_set(struct css_set *cset) { /* * Ensure that the refcount doesn't hit zero while any readers @@ -548,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit) return; down_write(&css_set_rwsem); - put_css_set_locked(cset, taskexit); + put_css_set_locked(cset); up_write(&css_set_rwsem); } @@ -969,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't - * (usually) take cgroup_mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cgroup_exit(), - * when a task in a notify_on_release cgroup exits. Then cgroup_mutex - * is taken, and if the cgroup count is zero, a usermode call made - * to the release agent with the name of the cgroup (path relative to - * the root of cgroup file system) as the argument. - * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at @@ -1046,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp) } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup + * @subtree_control: the new subtree_control mask to consider * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * - * This function determines which subsystems need to be enabled given the - * current @cgrp->subtree_control and records it in - * @cgrp->child_subsys_mask. The resulting mask is always a superset of - * @cgrp->subtree_control and follows the usual hierarchy rules. + * This function calculates which subsystems need to be enabled if + * @subtree_control is to be applied to @cgrp. The returned mask is always + * a superset of @subtree_control and follows the usual hierarchy rules. 
*/ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned int subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned int cur_ss_mask = cgrp->subtree_control; + unsigned int cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) { - cgrp->child_subsys_mask = cur_ss_mask; - return; - } + if (!cgroup_on_dfl(cgrp)) + return cur_ss_mask; while (true) { unsigned int new_ss_mask = cur_ss_mask; @@ -1094,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) cur_ss_mask = new_ss_mask; } - cgrp->child_subsys_mask = cur_ss_mask; + return cur_ss_mask; +} + +/** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * Update @cgrp->child_subsys_mask according to the current + * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + cgrp->child_subsys_mask = + cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); } /** @@ -1587,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; @@ -1597,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } static void init_cgroup_root(struct cgroup_root *root, @@ -1634,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -2052,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * task. As trading it for new_cset is protected by cgroup_mutex, * we're safe to drop it here; it will be freed under RCU. 
*/ - set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set_locked(old_cset, false); + put_css_set_locked(old_cset); } /** @@ -2074,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); - put_css_set_locked(cset, false); + put_css_set_locked(cset); } up_write(&css_set_rwsem); } @@ -2168,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); - put_css_set(src_cset, false); - put_css_set(dst_cset, false); + put_css_set(src_cset); + put_css_set(dst_cset); continue; } @@ -2178,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (list_empty(&dst_cset->mg_preload_node)) list_add(&dst_cset->mg_preload_node, &csets); else - put_css_set(dst_cset, false); + put_css_set(dst_cset); } list_splice_tail(&csets, preloaded_csets); @@ -2668,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - unsigned int css_enable, css_disable, old_ctrl, new_ctrl; + unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2720,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = -ENOENT; goto out_unlock; } - - /* - * @ss is already enabled through dependency and - * we'll just make it visible. Skip draining. - */ - if (cgrp->child_subsys_mask & (1 << ssid)) - continue; - - /* - * Because css offlining is asynchronous, userland - * might try to re-enable the same controller while - * the previous instance is still around. In such - * cases, wait till it's gone using offline_waitq. - */ - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); @@ -2785,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * subsystems than specified may need to be enabled or disabled * depending on subsystem dependencies. */ - cgrp->subtree_control |= enable; - cgrp->subtree_control &= ~disable; - - old_ctrl = cgrp->child_subsys_mask; - cgroup_refresh_child_subsys_mask(cgrp); - new_ctrl = cgrp->child_subsys_mask; + old_sc = cgrp->subtree_control; + old_ss = cgrp->child_subsys_mask; + new_sc = (old_sc | enable) & ~disable; + new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - css_enable = ~old_ctrl & new_ctrl; - css_disable = old_ctrl & ~new_ctrl; + css_enable = ~old_ss & new_ss; + css_disable = old_ss & ~new_ss; enable |= css_enable; disable |= css_disable; /* + * Because css offlining is asynchronous, userland might try to + * re-enable the same controller while the previous instance is + * still around. In such cases, wait till it's gone using + * offline_waitq. 
+ */ + for_each_subsys(ss, ssid) { + if (!(css_enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); + + if (!cgroup_css(child, ss)) + continue; + + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); + + return restart_syscall(); + } + } + + cgrp->subtree_control = new_sc; + cgrp->child_subsys_mask = new_ss; + + /* * Create new csses or make the existing ones visible. A css is * created invisible if it's being implicitly enabled through * dependency. An invisible css is made visible when the userland @@ -2852,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } } + /* + * The effective csses of all the descendants (excluding @cgrp) may + * have changed. Subsystems can optionally subscribe to this event + * by implementing ->css_e_css_changed() which is invoked if any of + * the effective csses seen from the css's cgroup may have changed. + */ + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); + struct cgroup_subsys_state *css; + + if (!ss->css_e_css_changed || !this_css) + continue; + + css_for_each_descendant_pre(css, this_css) + if (css != this_css) + ss->css_e_css_changed(css); + } + kernfs_activate(cgrp->kn); ret = 0; out_unlock: @@ -2859,9 +2898,8 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->subtree_control &= ~enable; - cgrp->subtree_control |= disable; - cgroup_refresh_child_subsys_mask(cgrp); + cgrp->subtree_control = old_sc; + cgrp->child_subsys_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -4173,7 +4211,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else @@ -4351,6 +4388,7 @@ static void css_free_work_fn(struct work_struct *work) /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); + cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { /* @@ -4397,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); + if (ss->css_released) + ss->css_released(css); } else { /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); @@ -4510,7 +4550,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4583,7 +4623,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; @@ -4813,19 +4853,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* CSS_ONLINE is clear, remove from ->release_list for the last time */ - raw_spin_lock(&release_list_lock); - if (!list_empty(&cgrp->release_list)) - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - /* * Remove @cgrp directory along with the base files. 
@cgrp has an * extra ref on its kn. */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); check_for_release(cgroup_parent(cgrp)); /* put the base reference */ @@ -4842,13 +4875,10 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgrp = cgroup_kn_lock_live(kn); if (!cgrp) return 0; - cgroup_get(cgrp); /* for @kn->priv clearing */ ret = cgroup_destroy_locked(cgrp); cgroup_kn_unlock(kn); - - cgroup_put(cgrp); return ret; } @@ -5052,12 +5082,9 @@ core_initcall(cgroup_wq_init); * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ - -/* TODO: Use a proper seq_file iterator */ -int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *path; int retval; struct cgroup_root *root; @@ -5067,14 +5094,6 @@ int proc_cgroup_show(struct seq_file *m, void *v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = 0; - mutex_lock(&cgroup_mutex); down_read(&css_set_rwsem); @@ -5104,11 +5123,10 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_putc(m, '\n'); } + retval = 0; out_unlock: up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - put_task_struct(tsk); -out_free: kfree(buf); out: return retval; @@ -5179,7 +5197,7 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * This may race against cgroup_enable_task_cg_links(). As that + * This may race against cgroup_enable_task_cg_lists(). As that * function sets use_task_css_set_links before grabbing * tasklist_lock and we just went through tasklist_lock to add * @child, it's guaranteed that either we see the set @@ -5194,7 +5212,7 @@ void cgroup_post_fork(struct task_struct *child) * when implementing operations which need to migrate all tasks of * a cgroup to another. * - * Note that if we lose to cgroup_enable_task_cg_links(), @child + * Note that if we lose to cgroup_enable_task_cg_lists(), @child * will remain in init_css_set. This is safe because all tasks are * in the init_css_set before cg_links is enabled and there's no * operation which transfers all tasks out of init_css_set. @@ -5278,30 +5296,14 @@ void cgroup_exit(struct task_struct *tsk) } if (put_cset) - put_css_set(cset, true); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && - !css_has_online_children(&cgrp->self)) { - /* - * Control Group is currently removeable. 
If it's not - * already queued for a userspace notification, queue - * it now - */ - int need_schedule_work = 0; - - raw_spin_lock(&release_list_lock); - if (!cgroup_is_dead(cgrp) && - list_empty(&cgrp->release_list)) { - list_add(&cgrp->release_list, &release_list); - need_schedule_work = 1; - } - raw_spin_unlock(&release_list_lock); - if (need_schedule_work) - schedule_work(&release_agent_work); - } + if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); } /* @@ -5329,52 +5331,39 @@ static void check_for_release(struct cgroup *cgrp) */ static void cgroup_release_agent(struct work_struct *work) { - BUG_ON(work != &release_agent_work); + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL, *path; + char *argv[3], *envp[3]; + mutex_lock(&cgroup_mutex); - raw_spin_lock(&release_list_lock); - while (!list_empty(&release_list)) { - char *argv[3], *envp[3]; - int i; - char *pathbuf = NULL, *agentbuf = NULL, *path; - struct cgroup *cgrp = list_entry(release_list.next, - struct cgroup, - release_list); - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathbuf) - goto continue_free; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); - if (!path) - goto continue_free; - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!agentbuf) - goto continue_free; - - i = 0; - argv[i++] = agentbuf; - argv[i++] = path; - argv[i] = NULL; - - i = 0; - /* minimal command environment */ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - /* Drop the lock while we invoke the usermode helper, - * since the exec could involve hitting disk and hence - * be a slow process */ - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - mutex_lock(&cgroup_mutex); - continue_free: - kfree(pathbuf); - kfree(agentbuf); - raw_spin_lock(&release_list_lock); - } - raw_spin_unlock(&release_list_lock); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) + goto out; + + argv[0] = agentbuf; + argv[1] = path; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); } static int __init cgroup_disable(char *str) @@ -5562,7 +5551,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); + return (!cgroup_has_tasks(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); } static struct cftype debug_files[] = { diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 000000000000..c2de56ab0fce --- /dev/null +++ b/kernel/configs/tiny.config @@ -0,0 +1,4 @@ +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_KERNEL_XZ=y +CONFIG_OPTIMIZE_INLINING=y +CONFIG_SLOB=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- 
a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/cpu.c b/kernel/cpu.c index 81e2a388a0f6..5d220234b3ca 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). 
*/ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -79,9 +81,21 @@ static struct { /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire_tryread() \ + lock_map_acquire_tryread(&cpu_hotplug.dep_map) #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +static void apply_puts_pending(int max) +{ + int delta; + + if (atomic_read(&cpu_hotplug.puts_pending) >= max) { + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } +} + void get_online_cpus(void) { might_sleep(); @@ -89,17 +103,35 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); + apply_puts_pending(65536); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); - } EXPORT_SYMBOL_GPL(get_online_cpus); +bool try_get_online_cpus(void) +{ + if (cpu_hotplug.active_writer == current) + return true; + if (!mutex_trylock(&cpu_hotplug.lock)) + return false; + cpuhp_lock_acquire_tryread(); + apply_puts_pending(65536); + cpu_hotplug.refcount++; + mutex_unlock(&cpu_hotplug.lock); + return true; +} +EXPORT_SYMBOL_GPL(try_get_online_cpus); + void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -141,6 +173,7 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + apply_puts_pending(1); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 52cb04c993b7..64b257f6bca2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -248,34 +248,34 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global mutexes guarding cpuset structures - cpuset_mutex - * and callback_mutex. The latter may nest inside the former. We also - * require taking task_lock() when dereferencing a task's cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this + * comment. * - * A task must hold both mutexes to modify cpusets. If a task holds + * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_mutex and be able to + * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_mutex to query cpusets. - * Once it is ready to make the changes, it takes callback_mutex, blocking + * callback routines can briefly acquire callback_lock to query cpusets. + * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. 
* * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex + * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_mutex, then it has read-only + * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_mutex across + * The cpuset_common_file_read() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. @@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = { * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { @@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cpuset_mutex held + * Call with callback_lock or cpuset_mutex held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) goto out; } + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. 
+ */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + ret = 0; out: rcu_read_unlock(); @@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ update_cpumasks_hier(cs, trialcs->cpus_allowed); @@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); @@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &cs->mems_allowed); @@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, goto out_unlock; cgroup_taskset_for_each(task, tset) { - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. 
- */ - ret = -EINVAL; - if (task->flags & PF_NO_SETAFFINITY) + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) @@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) count = seq_get_buf(sf, &buf); s = buf; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: @@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) seq_commit(sf, -1); } out_unlock: - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); return ret; } @@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(cs->css.cgroup)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); return 0; @@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(root_css->cgroup)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); @@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } @@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, { bool is_empty; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, @@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs, if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (cpus_updated) update_tasks_cpumask(cs); @@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); cpumask_copy(top_cpuset.effective_cpus, &new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; - 
mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } @@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - mutex_lock(&callback_mutex); + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) @@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; + unsigned long flags; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return mask; } @@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) @@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall - * cpusets, and never sleeps. - * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by * (in get_page_from_freelist()) refusing to consider the zones for @@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The + * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. + * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, @@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. 
- * - * Rule: - * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +int __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ + unsigned long flags; if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; /* @@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return allowed; } -/* - * cpuset_node_allowed_hardwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If the task has been OOM killed and has access to memory reserves as - * specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_node_allowed_softwall() variant, above, - * this variant requires that the node be in the current task's - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. - */ -int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page @@ -2730,10 +2683,9 @@ void __cpuset_memory_pressure_bump(void) * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. 
*/ -int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *p; struct cgroup_subsys_state *css; int retval; @@ -2743,24 +2695,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); if (!p) - goto out_put_task; + goto out_free; seq_puts(m, p); seq_putc(m, '\n'); retval = 0; -out_put_task: - put_task_struct(tsk); out_free: kfree(buf); out: diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index c766ee54c0b1..b64e238b553b 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,6 +18,7 @@ unsigned long saved_max_pfn; * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL_GPL(elfcorehdr_addr); /* * stores the size of elf header of crash image diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -27,6 +27,9 @@ * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ + +#define pr_fmt(fmt) "KGDB: " fmt + #include <linux/pid_namespace.h> #include <linux/clocksource.h> #include <linux/serial_core.h> @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) return err; err = kgdb_arch_remove_breakpoint(&tmp); if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); + pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", + addr); return err; } @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", - kgdb_break[i].bpt_addr); + pr_info("BP install failed: %lx\n", + kgdb_break[i].bpt_addr); continue; } @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) continue; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", - kgdb_break[i].bpt_addr); + pr_info("BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) goto setundefined; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + pr_err("breakpoint remove failed: %lx\n", kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) if (print_wait) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); + pr_crit("waiting... 
or $3#33 for KDB\n"); #else - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + pr_crit("Waiting for remote debugger\n"); #endif } return 1; @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); dbg_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); + pr_crit("re-enter error: breakpoint removed %lx\n", addr); WARN_ON_ONCE(1); return 1; @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) panic("Recursive entry to debugger"); } - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + pr_crit("re-enter exception: ALL breakpoints killed\n"); #ifdef CONFIG_KGDB_KDB /* Allow kdb to debug itself one level */ return 0; @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int cpu; int trace_on = 0; int online_cpus = num_online_cpus(); + u64 time_left; kgdb_info[ks->cpu].enter_kgdb++; kgdb_info[ks->cpu].exception_state |= exception_state; @@ -595,9 +598,13 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + - atomic_read(&slaves_in_kgdb)) != online_cpus) + time_left = loops_per_jiffy * HZ; + while (kgdb_do_roundup && --time_left && + (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != + online_cpus) cpu_relax(); + if (!time_left) + pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -795,15 +802,15 @@ static struct console kgdbcons = { static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + pr_crit("ERROR: No KGDB I/O module available\n"); return; } if (!kgdb_connected) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); + pr_crit("KGDB or $3#33 for KDB\n"); #else - printk(KERN_CRIT "Entering KGDB\n"); + pr_crit("Entering KGDB\n"); #endif } @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) { kgdb_break_asap = 0; - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + pr_crit("Waiting for connection from remote gdb...\n"); kgdb_breakpoint(); } @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) if (dbg_io_ops) { spin_unlock(&kgdb_registration_lock); - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); + pr_err("Another I/O driver is already registered with KGDB\n"); return -EBUSY; } @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_dbg_io_ops->name); + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. 
*/ kgdb_register_callbacks(); @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 70a504601dc3..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) bp->bph_length = 1; if ((argc + 1) != nextarg) { - if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0) bp->bp_type = BP_ACCESS_WATCHPOINT; - else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) bp->bp_type = BP_WRITE_WATCHPOINT; - else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0) bp->bp_type = BP_HARDWARE_BREAKPOINT; else return KDB_ARGCOUNT; @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) bp->bp_free = 1; - kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", - "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", - "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", - "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bc", kdb_bc, "<bpnum>", - "Clear Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("be", kdb_bc, "<bpnum>", - "Enable Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bd", kdb_bc, "<bpnum>", - "Disable Breakpoint", 0, KDB_REPEAT_NONE); - - kdb_register_repeat("ss", kdb_ss, "", - "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + + kdb_register_flags("ss", kdb_ss, "", + "Single Step", 1, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. 
*/ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } + /* set CATASTROPHIC if the system contains unresponsive processors */ + for_each_online_cpu(i) + if (!kgdb_info[i].enter_kgdb) + KDB_FLAG_SET(CATASTROPHIC); if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..f191bddf64b8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -12,6 +12,7 @@ */ #include <linux/ctype.h> +#include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kmsg_dump.h> @@ -23,6 +24,7 @@ #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -42,6 +44,12 @@ #include <linux/slab.h> #include "kdb_private.h" +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "kdb." + +static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; +module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); + #define GREP_LEN 256 char kdb_grep_string[GREP_LEN]; int kdb_grepping_flag; @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(BADLENGTH, "Invalid length field"), KDBMSG(NOBP, "No Breakpoint exists"), KDBMSG(BADADDR, "Invalid address"), + KDBMSG(NOPERM, "Permission denied"), }; #undef KDBMSG @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) } /* + * Check whether the flags of the current command and the permissions + * of the kdb console has allow a command to be run. + */ +static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, + bool no_args) +{ + /* permissions comes from userspace so needs massaging slightly */ + permissions &= KDB_ENABLE_MASK; + permissions |= KDB_ENABLE_ALWAYS_SAFE; + + /* some commands change group when launched with no arguments */ + if (no_args) + permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; + + flags |= KDB_ENABLE_ALL; + + return permissions & flags; +} + +/* * kdbgetenv - This function will return the character string value of * an environment variable. * Parameters: @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, kdb_symtab_t symtab; /* + * If the enable flags prohibit both arbitrary memory access + * and flow control then there are no reasonable grounds to + * provide symbol lookup. + */ + if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, + kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Process arguments which follow the following syntax: * * symbol | numeric-address [+/- numeric-offset] @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = 0; if (s->usable) - kdb_register(s->name, kdb_exec_defcmd, - s->usage, s->help, 0); + /* macros are always safe because when executed each + * internal command re-enters kdb_parse() and is + * safety checked individually. 
+ */ + kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, + s->help, 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } if (!s->usable) @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) if (i < kdb_max_commands) { int result; + + if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + return KDB_NOPERM; + KDB_STATE_SET(CMD); result = (*tp->cmd_func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - switch (tp->cmd_repeat) { - case KDB_REPEAT_NONE: - argc = 0; - if (argv[0]) - *(argv[0]) = '\0'; - break; - case KDB_REPEAT_NO_ARGS: - argc = 1; - if (argv[1]) - *(argv[1]) = '\0'; - break; - case KDB_REPEAT_WITH_ARGS: - break; - } + + if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + return result; + + argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + if (argv[argc]) + *(argv[argc]) = '\0'; return result; } @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) */ static int kdb_sr(int argc, const char **argv) { + bool check_mask = + !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); + if (argc != 1) return KDB_ARGCOUNT; + kdb_trap_printk++; - __handle_sysrq(*argv[1], false); + __handle_sysrq(*argv[1], check_mask); kdb_trap_printk--; return 0; @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) { state = 'F'; /* cpu is offline */ + } else if (!kgdb_info[i].enter_kgdb) { + state = 'D'; /* cpu is online but unresponsive */ } else { state = ' '; /* cpu is responding to kdb */ if (kdb_task_state_char(KDB_TSK(i)) == 'I') @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) return 0; if (!kt->cmd_name) continue; + if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + continue; if (strlen(kt->cmd_usage) > 20) space = "\n "; kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) } /* - * kdb_register_repeat - This function is used to register a kernel + * kdb_register_flags - This function is used to register a kernel * debugger command. * Inputs: * cmd Command name @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) * zero for success, one if a duplicate command. */ #define kdb_command_extend 50 /* arbitrary */ -int kdb_register_repeat(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_repeat_t repeat) +int kdb_register_flags(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_cmdflags_t flags) { int i; kdbtab_t *kp; @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, kp->cmd_func = func; kp->cmd_usage = usage; kp->cmd_help = help; - kp->cmd_flags = 0; kp->cmd_minlen = minlen; - kp->cmd_repeat = repeat; + kp->cmd_flags = flags; return 0; } -EXPORT_SYMBOL_GPL(kdb_register_repeat); +EXPORT_SYMBOL_GPL(kdb_register_flags); /* * kdb_register - Compatibility register function for commands that do * not need to specify a repeat state. Equivalent to - * kdb_register_repeat with KDB_REPEAT_NONE. + * kdb_register_flags with flags set to 0. 
* Inputs: * cmd Command name * func Function to execute the command @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, char *help, short minlen) { - return kdb_register_repeat(cmd, func, usage, help, minlen, - KDB_REPEAT_NONE); + return kdb_register_flags(cmd, func, usage, help, minlen, 0); } EXPORT_SYMBOL_GPL(kdb_register); @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) for_each_kdbcmd(kp, i) kp->cmd_name = NULL; - kdb_register_repeat("md", kdb_md, "<vaddr>", + kdb_register_flags("md", kdb_md, "<vaddr>", "Display Memory Contents, also mdWcN, e.g. md8c1", 1, - KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", - "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", - "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mds", kdb_md, "<vaddr>", - "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", - "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("go", kdb_go, "[<vaddr>]", - "Continue Execution", 1, KDB_REPEAT_NONE); - kdb_register_repeat("rd", kdb_rd, "", - "Display Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", - "Modify Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ef", kdb_ef, "<vaddr>", - "Display exception frame", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", - "Stack traceback", 1, KDB_REPEAT_NONE); - kdb_register_repeat("btp", kdb_bt, "<pid>", - "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", - "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btt", kdb_bt, "<vaddr>", + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, + KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); + kdb_register_flags("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, + KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("rd", kdb_rd, "", + "Display Registers", 0, + KDB_ENABLE_REG_READ); + kdb_register_flags("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, + KDB_ENABLE_REG_WRITE); + kdb_register_flags("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, + KDB_ENABLE_MEM_READ); + kdb_register_flags("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, - KDB_REPEAT_NONE); - kdb_register_repeat("env", kdb_env, "", - "Show environment 
variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("set", kdb_set, "", - "Set environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("help", kdb_help, "", - "Display Help Message", 1, KDB_REPEAT_NONE); - kdb_register_repeat("?", kdb_help, "", - "Display Help Message", 0, KDB_REPEAT_NONE); - kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", - "Switch to new cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", - "Display active task list", 0, KDB_REPEAT_NONE); - kdb_register_repeat("pid", kdb_pid, "<pidnum>", - "Switch to another task", 0, KDB_REPEAT_NONE); - kdb_register_repeat("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, KDB_REPEAT_NONE); + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("env", kdb_env, "", + "Show environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("set", kdb_set, "", + "Set environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("help", kdb_help, "", + "Display Help Message", 1, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("?", kdb_help, "", + "Display Help Message", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, + KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, 0); + kdb_register_flags("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, + KDB_ENABLE_REBOOT); #if defined(CONFIG_MODULES) - kdb_register_repeat("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, KDB_REPEAT_NONE); + kdb_register_flags("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, + KDB_ENABLE_INSPECT); #endif #if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_repeat("sr", kdb_sr, "<key>", - "Magic SysRq key", 0, KDB_REPEAT_NONE); + kdb_register_flags("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif #if defined(CONFIG_PRINTK) - kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, KDB_REPEAT_NONE); + kdb_register_flags("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif if (arch_kgdb_ops.enable_nmi) { - kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", - "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); - } - kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", - "Send a signal to a process", 0, KDB_REPEAT_NONE); - kdb_register_repeat("summary", kdb_summary, "", - "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", - "Display per_cpu variables", 3, KDB_REPEAT_NONE); - kdb_register_repeat("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, KDB_REPEAT_NONE); + kdb_register_flags("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, + KDB_ENABLE_ALWAYS_SAFE); + } + kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, + KDB_ENABLE_SIGNAL); + 
kdb_register_flags("summary", kdb_summary, "", + "Summarize the system", 4, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", + "Display per_cpu variables", 3, + KDB_ENABLE_MEM_READ); + kdb_register_flags("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, + KDB_ENABLE_ALWAYS_SAFE); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -172,10 +172,9 @@ typedef struct _kdbtab { kdb_func_t cmd_func; /* Function to execute command */ char *cmd_usage; /* Usage String for this command */ char *cmd_help; /* Help message for this command */ - short cmd_flags; /* Parsing flags */ short cmd_minlen; /* Minimum legal # command * chars required */ - kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ } kdbtab_t; extern int kdb_bt(int, const char **); /* KDB display back trace */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..d659487254d5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -52,7 +52,7 @@ static void release_callchain_buffers(void) struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); + RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) int cpu; struct callchain_cpus_entries *entries; - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) static void put_callchain_entry(int rctx) { - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 963bf139e2b2..882f835a0d85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -47,6 +47,8 @@ #include <asm/irq_regs.h> +static struct workqueue_struct *perf_wq; + struct remote_function_call { struct task_struct *p; int (*func)(void *info); @@ -120,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define EVENT_OWNER_KERNEL ((void *) -1) + +static bool is_kernel_event(struct perf_event *event) +{ + return event->owner == EVENT_OWNER_KERNEL; +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -240,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w) u64 avg_local_sample_len; u64 local_samples_len; - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; printk_ratelimited(KERN_WARNING @@ -262,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns) return; /* decay the counter by 1 average sample */ - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; local_samples_len += sample_len_ns; - 
__get_cpu_var(running_sample_length) = local_samples_len; + __this_cpu_write(running_sample_length, local_samples_len); /* * note: this will be biased artifically low until we have @@ -392,14 +401,9 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - static inline void perf_detach_cgroup(struct perf_event *event) { - perf_put_cgroup(event); + css_put(&event->cgrp->css); event->cgrp = NULL; } @@ -610,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_dentry, + css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -878,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list); static void perf_pmu_rotate_start(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); WARN_ON(!irqs_disabled()); @@ -902,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx) } } -static void unclone_ctx(struct perf_event_context *ctx) +/* + * This must be done under the ctx->lock, such as to serialize against + * context_equiv(), therefore we cannot call put_ctx() since that might end up + * calling scheduler related locks and ctx->lock nests inside those. + */ +static __must_check struct perf_event_context * +unclone_ctx(struct perf_event_context *ctx) { - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); + struct perf_event_context *parent_ctx = ctx->parent_ctx; + + lockdep_assert_held(&ctx->lock); + + if (parent_ctx) ctx->parent_ctx = NULL; - } ctx->generation++; + + return parent_ctx; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1375,6 +1389,45 @@ out: perf_event__header_size(tmp); } +/* + * User event without the task. + */ +static bool is_orphaned_event(struct perf_event *event) +{ + return event && !is_kernel_event(event) && !event->owner; +} + +/* + * Event has a parent but parent's task finished and it's + * alive only because of children holding refference. + */ +static bool is_orphaned_child(struct perf_event *event) +{ + return is_orphaned_event(event->parent); +} + +static void orphans_remove_work(struct work_struct *work); + +static void schedule_orphans_remove(struct perf_event_context *ctx) +{ + if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) + return; + + if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { + get_ctx(ctx); + ctx->orphans_remove_sched = true; + } +} + +static int __init perf_workqueue_init(void) +{ + perf_wq = create_singlethread_workqueue("perf"); + WARN(!perf_wq, "failed to create perf workqueue\n"); + return perf_wq ? 0 : -1; +} + +core_initcall(perf_workqueue_init); + static inline int event_filter_match(struct perf_event *event) { @@ -1424,6 +1477,9 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + perf_pmu_enable(event->pmu); } @@ -1506,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group if (!task) { /* - * Per cpu events are removed via an smp call and - * the removal is always successful. + * Per cpu events are removed via an smp call. 
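/*
 * Editor's sketch of the accessor conversion in these hunks:
 * __get_cpu_var() yielded an lvalue for the local CPU's copy, while
 * the preferred helpers read or write the scalar explicitly
 * (__this_cpu_read()/__this_cpu_write()) or hand back a pointer
 * (this_cpu_ptr()).  EXAMPLE_ACCUMULATED is a stand-in constant.
 */
#include <linux/percpu.h>
#include <linux/types.h>

#define EXAMPLE_ACCUMULATED	128

static DEFINE_PER_CPU(u64, example_sample_len);

static void example_decay_and_add(u64 sample_ns)
{
	u64 len = __this_cpu_read(example_sample_len);

	len -= len / EXAMPLE_ACCUMULATED;	/* decay by one average sample */
	len += sample_ns;
	__this_cpu_write(example_sample_len, len);
}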
The removal can + * fail if the CPU is currently offline, but in that case we + * already called __perf_remove_from_context from + * perf_event_exit_cpu. */ cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; @@ -1731,6 +1789,9 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + out: perf_pmu_enable(event->pmu); @@ -2210,6 +2271,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { + lockdep_assert_held(&ctx1->lock); + lockdep_assert_held(&ctx2->lock); + /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; @@ -2331,7 +2395,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ - if (!parent || !next_parent) + if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { @@ -2400,7 +2464,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_out(task, next); } @@ -2643,11 +2707,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, * to check if we have to switch in PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); /* check for system-wide branch_stack events */ - if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) perf_branch_stack_sched_in(prev, task); } @@ -2902,7 +2966,7 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); struct perf_cpu_context *cpuctx, *tmp; struct perf_event_context *ctx; int throttled; @@ -2943,6 +3007,7 @@ static int event_enable_on_exec(struct perf_event *event, */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { + struct perf_event_context *clone_ctx = NULL; struct perf_event *event; unsigned long flags; int enabled = 0; @@ -2974,7 +3039,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * Unclone this context if we enabled any event. 
*/ if (enabled) - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); raw_spin_unlock(&ctx->lock); @@ -2984,6 +3049,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); + + if (clone_ctx) + put_ctx(clone_ctx); } void perf_event_exec(void) @@ -3078,6 +3146,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); + INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3135,7 +3204,7 @@ errout: static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { - struct perf_event_context *ctx; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int ctxn, err; @@ -3169,9 +3238,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); + + if (clone_ctx) + put_ctx(clone_ctx); } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; @@ -3323,16 +3395,12 @@ static void free_event(struct perf_event *event) } /* - * Called when the last reference to the file is gone. + * Remove user event from the owner task. */ -static void put_event(struct perf_event *event) +static void perf_remove_from_owner(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; struct task_struct *owner; - if (!atomic_long_dec_and_test(&event->refcount)) - return; - rcu_read_lock(); owner = ACCESS_ONCE(event->owner); /* @@ -3365,6 +3433,20 @@ static void put_event(struct perf_event *event) mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } +} + +/* + * Called when the last reference to the file is gone. + */ +static void put_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (!atomic_long_dec_and_test(&event->refcount)) + return; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); WARN_ON_ONCE(ctx->parent_ctx); /* @@ -3399,6 +3481,42 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +/* + * Remove all orphanes events from the context. 
+ */ +static void orphans_remove_work(struct work_struct *work) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + + ctx = container_of(work, struct perf_event_context, + orphans_remove.work); + + mutex_lock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + struct perf_event *parent_event = event->parent; + + if (!is_orphaned_child(event)) + continue; + + perf_remove_from_context(event, true); + + mutex_lock(&parent_event->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent_event->child_mutex); + + free_event(event); + put_event(parent_event); + } + + raw_spin_lock_irq(&ctx->lock); + ctx->orphans_remove_sched = false; + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -3496,6 +3614,19 @@ static int perf_event_read_one(struct perf_event *event, return n * sizeof(u64); } +static bool is_event_hup(struct perf_event *event) +{ + bool no_children; + + if (event->state != PERF_EVENT_STATE_EXIT) + return false; + + mutex_lock(&event->child_mutex); + no_children = list_empty(&event->child_list); + mutex_unlock(&event->child_mutex); + return no_children; +} + /* * Read the performance event - simple non blocking version for now */ @@ -3537,7 +3668,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLL_HUP; + unsigned int events = POLLHUP; + + poll_wait(file, &event->waitq, wait); + + if (is_event_hup(event)) + return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise @@ -3548,9 +3684,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); - - poll_wait(file, &event->waitq, wait); - return events; } @@ -4327,22 +4460,29 @@ perf_output_sample_regs(struct perf_output_handle *handle, } } -static void perf_sample_regs_user(struct perf_regs_user *regs_user, - struct pt_regs *regs) +static void perf_sample_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) { - if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - regs_user->abi = perf_reg_abi(current); + } else if (current->mm) { + perf_get_regs_user(regs_user, regs, regs_user_copy); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; } } +static void perf_sample_regs_intr(struct perf_regs *regs_intr, + struct pt_regs *regs) +{ + regs_intr->regs = regs; + regs_intr->abi = perf_reg_abi(current); +} + + /* * Get remaining task size from user stack pointer. * @@ -4724,6 +4864,23 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); + if (sample_type & PERF_SAMPLE_REGS_INTR) { + u64 abi = data->regs_intr.abi; + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
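/*
 * Consumer-side sketch (not from this diff) of the new
 * PERF_SAMPLE_REGS_INTR sample type: sample_regs_intr carries an
 * architecture-specific PERF_REG_* bitmask, which perf_copy_attr()
 * below rejects via perf_reg_validate() if unsupported bits are
 * set.  The 0x3f mask is an arbitrary placeholder.
 */
#include <linux/perf_event.h>

static struct perf_event_attr example_regs_intr_attr = {
	.type			= PERF_TYPE_HARDWARE,
	.config			= PERF_COUNT_HW_CPU_CYCLES,
	.size			= sizeof(struct perf_event_attr),
	.sample_period		= 100000,
	.sample_type		= PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR,
	.sample_regs_intr	= 0x3f,
};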
+ */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_intr; + + perf_output_sample_regs(handle, + data->regs_intr.regs, + mask); + } + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4789,12 +4946,14 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + perf_sample_regs_user(&data->regs_user, regs, + &data->regs_user_copy); + if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); - perf_sample_regs_user(&data->regs_user, regs); - if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); @@ -4810,15 +4969,11 @@ void perf_prepare_sample(struct perf_event_header *header, * in case new sample type is added, because we could eat * up the rest of the sample size. */ - struct perf_regs_user *uregs = &data->regs_user; u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); - if (!uregs->abi) - perf_sample_regs_user(uregs, regs); - stack_size = perf_sample_ustack_size(stack_size, header->size, - uregs->regs); + data->regs_user.regs); /* * If there is something to dump, add space for the dump @@ -4831,6 +4986,21 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_intr(&data->regs_intr, regs); + + if (data->regs_intr.regs) { + u64 mask = event->attr.sample_regs_intr; + + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -5702,7 +5872,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; @@ -5721,7 +5891,7 @@ end: int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } @@ -5729,7 +5899,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); inline void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } @@ -5758,7 +5928,7 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_add(struct perf_event *event, int flags) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; @@ -5814,7 +5984,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) if (!hlist) return; - rcu_assign_pointer(swhash->swevent_hlist, NULL); + RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } @@ -5940,11 +6110,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5954,8 +6119,6 @@ static struct pmu perf_swevent = { 
.start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6073,8 +6236,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -6300,8 +6461,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6380,8 +6539,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6411,7 +6568,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* @@ -7031,6 +7188,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + ret = perf_reg_validate(attr->sample_regs_intr); out: return ret; @@ -7315,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { synchronize_rcu(); - perf_install_in_context(ctx, group_leader, event->cpu); + perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, event->cpu); + perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } } @@ -7397,6 +7556,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } + /* Mark owner so we could distinguish it from user events. */ + event->owner = EVENT_OWNER_KERNEL; + account_event(event); ctx = find_get_context(event->pmu, task, cpu); @@ -7484,6 +7646,12 @@ static void sync_child_event(struct perf_event *child_event, mutex_unlock(&parent_event->child_mutex); /* + * Make sure user/parent get notified, that we just + * lost one event. + */ + perf_event_wakeup(parent_event); + + /* * Release the parent event, if this was the last * reference to it. */ @@ -7517,13 +7685,16 @@ __perf_event_exit_task(struct perf_event *child_event, if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); + } else { + child_event->state = PERF_EVENT_STATE_EXIT; + perf_event_wakeup(child_event); } } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7550,28 +7721,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) child->perf_event_ctxp[ctxn] = NULL; /* - * In order to avoid freeing: child_ctx->parent_ctx->task - * under perf_event_context::lock, grab another reference. - */ - parent_ctx = child_ctx->parent_ctx; - if (parent_ctx) - get_ctx(parent_ctx); - - /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the events from it. 
*/ - unclone_ctx(child_ctx); + clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Now that we no longer hold perf_event_context::lock, drop - * our extra child_ctx->parent_ctx reference. - */ - if (parent_ctx) - put_ctx(parent_ctx); + if (clone_ctx) + put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we @@ -7700,6 +7859,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { + enum perf_event_active_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; @@ -7720,7 +7880,8 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + if (is_orphaned_event(parent_event) || + !atomic_long_inc_not_zero(&parent_event->refcount)) { free_event(child_event); return NULL; } @@ -7732,7 +7893,7 @@ inherit_event(struct perf_event *parent_event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; @@ -7997,7 +8158,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = false }; + struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; perf_pmu_rotate_stop(ctx->pmu); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1d0af8a2c646..cb346f26a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. 
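/*
 * Sketch of the read-side pattern these hunks convert to: i_mmap is
 * now protected by i_mmap_rwsem, so interval-tree walkers that only
 * inspect the vmas take i_mmap_lock_read() instead of the old
 * i_mmap_mutex.  walk_mappings() is illustrative only.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static void walk_mappings(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/* inspect vma here; writers are excluded, other readers not */
	}
	i_mmap_unlock_read(mapping);
}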
*/ prev = kmalloc(sizeof(struct map_info), @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; @@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void) if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a3..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. + */ + task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -214,27 +212,6 @@ repeat: } /* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. 
Newly orphaned process groups are @@ -461,6 +438,44 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + reaper = find_alive_thread(father); + if (reaper) { + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; +} + /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists @@ -468,58 +483,36 @@ static void exit_mm(struct task_struct *tsk) * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; + struct task_struct *thread, *reaper; - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; + thread = find_alive_thread(father); + if (thread) return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { - struct task_struct *reaper; + if (father->signal->has_child_subreaper) { /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. 
*/ - for (reaper = father->real_parent; - reaper != &init_task; + for (reaper = father; + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { - if (!(thread->flags & PF_EXITING)) - return reaper; - } while_each_thread(reaper, thread); + thread = find_alive_thread(reaper); + if (thread) + return thread; } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -528,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ @@ -547,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, dead); - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; + /* Can drop and reacquire tasklist_lock */ + reaper = find_child_reaper(father); + if (list_empty(&father->children)) + return; - do { + reaper = find_new_reaper(father, reaper); + list_for_each_entry(p, &father->children, sibling) { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); + } + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. 
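/*
 * Userspace-side sketch (not part of the patch): how a service
 * manager sets ->is_child_subreaper so the reworked
 * find_new_reaper() can pick it as the reaper for orphaned
 * descendants in its subtree instead of falling through to init.
 */
#include <sys/prctl.h>

static int become_child_subreaper(void)
{
	/* PR_SET_CHILD_SUBREAPER has been available since 3.4 */
	return prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
}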
+ */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, dead); } + list_splice_tail_init(&father->children, &reaper->children); } /* @@ -599,18 +588,12 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -628,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } #ifdef CONFIG_DEBUG_STACK_USAGE @@ -667,6 +653,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -775,6 +762,7 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -814,6 +802,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed @@ -978,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -993,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1002,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - - traced = ptrace_reparented(p); /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * We own this thread, nobody else can reap it. 
*/ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + read_unlock(&tasklist_lock); + sched_annotate_sleep(); + + /* + * Check thread_group_leader() to exclude the traced sub-threads. + */ + if (state == EXIT_DEAD && thread_group_leader(p)) { + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1028,21 +1022,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(¤t->sighand->siglock); + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1065,15 +1059,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->real_parent->sighand->siglock); + write_sequnlock(&psig->stats_lock); + spin_unlock_irq(¤t->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we its state == DEAD/TRACE. - */ - read_unlock(&tasklist_lock); - retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) @@ -1204,6 +1193,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1266,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage @@ -1296,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. 
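/*
 * Minimal lockless-reader sketch for the new sig->stats_lock
 * seqlock (initialized in copy_signal() further down): retry the
 * read if __exit_signal() or wait_task_zombie() updated the totals
 * concurrently.  read_sig_times() is illustrative, not a kernel
 * helper.
 */
#include <linux/sched.h>
#include <linux/seqlock.h>

static void read_sig_times(struct signal_struct *sig,
			   cputime_t *ut, cputime_t *st)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&sig->stats_lock);
		*ut = sig->utime;
		*st = sig->stime;
	} while (read_seqretry(&sig->stats_lock, seq));
}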
+ */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1319,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1346,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* diff --git a/kernel/extable.c b/kernel/extable.c index d8a6446adbcb..c98f926277a8 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -18,6 +18,7 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> +#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> @@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; + if (is_ftrace_trampoline(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return is_module_text_address(addr); + if (is_module_text_address(addr)) + return 1; + return is_ftrace_trampoline(addr); } /* diff --git a/kernel/fork.c b/kernel/fork.c index a91e47d86de2..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); @@ -427,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); @@ -439,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* @@ -601,9 +607,8 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON(mm->pmd_huge_pte); + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -1017,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) 
{ signalfd_cleanup(sighand); + /* + * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * without an RCU grace period, see __lock_task_sighand(). + */ kmem_cache_free(sighand_cachep, sighand); } } - /* * Initialize POSIX timer handling for a thread group. */ @@ -1068,6 +1076,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index 815d7af2ffe8..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key) case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies MB (B) */ break; + default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ + smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { @@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. 
- */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1617,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1632,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1708,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1725,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8ac4399..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -32,10 +32,13 @@ config GCOV_KERNEL Note that the debugfs filesystem has to be mounted to access profiling data. +config ARCH_HAS_GCOV_PROFILE_ALL + def_bool n + config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE + depends on ARCH_HAS_GCOV_PROFILE_ALL default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cfa..664411f171b5 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* init to 2 - one for init_task, one to ensure it is never freed */ @@ -213,6 +214,14 @@ out: return i; } +bool may_setgroups(void) +{ + struct user_namespace *user_ns = current_user_ns(); + + return ns_capable(user_ns, CAP_SETGID) && + userns_may_setgroups(user_ns); +} + /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. 
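/*
 * Userspace sketch of what may_setgroups() guards against, assuming
 * the companion /proc/<pid>/setgroups knob added elsewhere in this
 * series: before an unprivileged user namespace writes its gid_map,
 * setgroups() must be disabled, otherwise dropping a supplementary
 * group could bypass a "deny" ACL set in the parent namespace.  The
 * "0 1000 1" mapping below is an arbitrary example.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <unistd.h>

static int deny_setgroups_then_map_gid(void)
{
	int fd;

	if (unshare(CLONE_NEWUSER) != 0)
		return -1;

	fd = open("/proc/self/setgroups", O_WRONLY);
	if (fd < 0 || write(fd, "deny", 4) != 4)
		return -1;
	close(fd);

	fd = open("/proc/self/gid_map", O_WRONLY);
	if (fd < 0 || write(fd, "0 1000 1", 8) != 8)
		return -1;
	close(fd);
	return 0;
}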
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for hierarchical irq domains +config IRQ_DOMAIN_HIERARCHY + bool + select IRQ_DOMAIN + +# Generic MSI interrupt support +config GENERIC_MSI_IRQ + bool + +# Generic MSI hierarchical interrupt domain support +config GENERIC_MSI_IRQ_DOMAIN + bool + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + +config HANDLE_DOMAIN_IRQ + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o +obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6223fab9a9d2..6f1c7a566b95 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/irqdomain.h> #include <trace/events/irq.h> @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_state_clr_disabled(desc); desc->depth = 0; + irq_domain_activate_irq(&desc->irq_data); if (desc->irq_data.chip->irq_startup) { ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_domain_deactivate_irq(&desc->irq_data); irq_state_set_masked(desc); } @@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc) return irq_wait_for_poll(desc); } +static bool irq_may_run(struct irq_desc *desc) +{ + unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + + /* + * If the interrupt is not in progress and is not an armed + * wakeup interrupt, proceed. + */ + if (!irqd_has_set(&desc->irq_data, mask)) + return true; + + /* + * If the interrupt is an armed wakeup source, mark it pending + * and suspended, disable it and notify the pm core about the + * event. + */ + if (irq_pm_check_wakeup(desc)) + return false; + + /* + * Handle a potential concurrent poll on a different core. + */ + return irq_check_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out; + if (!irq_may_run(desc)) + goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -541,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (!irq_may_run(desc)) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out + * If its disabled or no action available then mask it and get + * out of here. */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; - } + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; } + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ @@ -602,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (!irq_may_run(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out + * If its disabled or no action available then mask it and get + * out of here. 
*/ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + goto out_eoi; } + kstat_incr_irqs_this_cpu(irq, desc); do { @@ -670,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = __this_cpu_ptr(action->percpu_dev_id); + void *dev_id = raw_cpu_ptr(action->percpu_dev_id); irqreturn_t res; kstat_incr_irqs_this_cpu(irq, desc); @@ -699,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) { handle = handle_bad_irq; } else { - if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + struct irq_data *irq_data = &desc->irq_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* + * With hierarchical domains we might run into a + * situation where the outermost chip is not yet set + * up, but the inner chips are there. Instead of + * bailing we install the handler, but obviously we + * cannot enable/startup the interrupt at this point. + */ + while (irq_data) { + if (irq_data->chip != &no_irq_chip) + break; + /* + * Bail out if the outer chip is not set up + * and the interrrupt supposed to be started + * right away. + */ + if (WARN_ON(is_chained)) + goto out; + /* Try the parent */ + irq_data = irq_data->parent_data; + } +#endif + if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) goto out; } @@ -818,3 +873,105 @@ void irq_cpu_offline(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_chip_ack_parent - Acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_ack(data); +} + +/** + * irq_chip_mask_parent - Mask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask(data); +} + +/** + * irq_chip_unmask_parent - Unmask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_unmask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_unmask(data); +} + +/** + * irq_chip_eoi_parent - Invoke EOI on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_eoi_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_eoi(data); +} + +/** + * irq_chip_set_affinity_parent - Set affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @dest: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Conditinal, as the underlying parent chip might not implement it. + */ +int irq_chip_set_affinity_parent(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + data = data->parent_data; + if (data->chip->irq_set_affinity) + return data->chip->irq_set_affinity(data, dest, force); + + return -ENOSYS; +} + +/** + * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware + * @data: Pointer to interrupt specific data + * + * Iterate through the domain hierarchy of the interrupt and check + * whether a hw retrigger function exists. If yes, invoke it. 
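The *_parent helpers introduced above exist so that a chip stacked on top of a hierarchical domain can simply forward operations to the level below it. A hedged sketch of such a leaf chip follows; the chip name is hypothetical, the callbacks are the helpers defined above.

        #include <linux/irq.h>

        /*
         * Hypothetical leaf chip in a stacked irq domain: every operation it
         * does not implement itself is forwarded to the parent chip.
         */
        static struct irq_chip foo_stacked_chip = {
                .name             = "FOO",
                .irq_ack          = irq_chip_ack_parent,
                .irq_mask         = irq_chip_mask_parent,
                .irq_unmask       = irq_chip_unmask_parent,
                .irq_eoi          = irq_chip_eoi_parent,
                .irq_set_affinity = irq_chip_set_affinity_parent,
                .irq_retrigger    = irq_chip_retrigger_hierarchy,
        };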
+ */ +int irq_chip_retrigger_hierarchy(struct irq_data *data) +{ + for (data = data->parent_data; data; data = data->parent_data) + if (data->chip && data->chip->irq_retrigger) + return data->chip->irq_retrigger(data); + + return -ENOSYS; +} +#endif + +/** + * irq_chip_compose_msi_msg - Componse msi message for a irq chip + * @data: Pointer to interrupt specific data + * @msg: Pointer to the MSI message + * + * For hierarchical domains we find the first chip in the hierarchy + * which implements the irq_compose_msi_msg callback. For non + * hierarchical we use the top level chip. + */ +int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_data *pos = NULL; + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + for (; data; data = data->parent_data) +#endif + if (data->chip && data->chip->irq_compose_msi_msg) + pos = data; + if (!pos) + return -ENOSYS; + + pos->chip->irq_compose_msi_msg(pos, msg); + + return 0; +} diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef0606797c9..d5d0f7345c54 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as - * request_irq(). IRQs requested with this function will be + * request_threaded_irq(). IRQs requested with this function will be * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cf80e7b0ddab..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache |= mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache &= ~mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.mask); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.mask); + irq_reg_writel(gc, mask, 
ct->regs.ack); irq_gc_unlock(gc); } @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); + irq_reg_writel(gc, mask, ct->regs.eoi); irq_gc_unlock(gc); } @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static u32 irq_readl_be(void __iomem *addr) +{ + return ioread32be(addr); +} + +static void irq_writel_be(u32 val, void __iomem *addr) +{ + iowrite32be(val, addr); +} + static void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } ct[i].mask_cache = mskptr; if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); + *mskptr = irq_reg_readl(gc, mskreg); } } @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, dgc->gc[i] = gc = tmp; irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, NULL, handler); + gc->domain = d; + if (gcflags & IRQ_GC_BE_IO) { + gc->reg_readl = &irq_readl_be; + gc->reg_writel = &irq_writel_be; + } + raw_spin_lock_irqsave(&gc_lock, flags); list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 099ea2e0eb88..df553b0af936 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -63,8 +63,8 @@ enum { extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); @@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } +extern void irq_lock_sparse(void); +extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); +static inline void irq_lock_sparse(void) { } +static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); @@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } + +#ifdef CONFIG_PM_SLEEP +bool irq_pm_check_wakeup(struct irq_desc *desc); +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); +#else +static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } +static inline void +irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } +static inline void +irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } +#endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..99793b9b6d23 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -14,6 +14,7 @@ #include <linux/kernel_stat.h> #include <linux/radix-tree.h> #include <linux/bitmap.h> +#include <linux/irqdomain.h> #include "internals.h" @@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc) static inline void free_masks(struct 
irq_desc *desc) { } #endif +void irq_lock_sparse(void) +{ + mutex_lock(&sparse_irq_lock); +} + +void irq_unlock_sparse(void) +{ + mutex_unlock(&sparse_irq_lock); +} + static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) { struct irq_desc *desc; @@ -167,6 +178,12 @@ static void free_desc(unsigned int irq) unregister_irq_proc(irq, desc); + /* + * sparse_irq_lock protects also show_interrupts() and + * kstat_irq_usr(). Once we deleted the descriptor from the + * sparse tree we can free it. Access in proc will fail to + * lookup the descriptor. + */ mutex_lock(&sparse_irq_lock); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/** + * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @lookup: Whether to perform the domain lookup or not + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq = hwirq; + int ret = 0; + + irq_enter(); + +#ifdef CONFIG_IRQ_DOMAIN + if (lookup) + irq = irq_find_mapping(domain, hwirq); +#endif + + /* + * Some hardware gives randomly wrong interrupts. Rather + * than crashing, do something sensible. + */ + if (unlikely(!irq || irq >= nr_irqs)) { + ack_bad_irq(irq); + ret = -EINVAL; + } else { + generic_handle_irq(irq); + } + + irq_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif + /* Dynamic interrupt handling */ /** @@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq) kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } +/** + * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu + * @irq: The interrupt number + * @cpu: The cpu number + * + * Returns the sum of interrupt counts on @cpu since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. + */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); @@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +/** + * kstat_irqs - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. + */ unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } + +/** + * kstat_irqs_usr - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. Contrary to kstat_irqs() this can be called from any + * preemptible context. It's protected against concurrent removal of + * an interrupt descriptor when sparse irqs are enabled. 
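__handle_domain_irq() above is meant to be called from the architecture's low-level entry path once the root interrupt controller has been read. A hedged sketch of such a caller is shown below; the register offset, MMIO base and domain pointer are hypothetical.

        #include <linux/irqdesc.h>
        #include <linux/irqdomain.h>
        #include <linux/io.h>

        static struct irq_domain *foo_root_domain;      /* set up at init time    */
        static void __iomem *foo_base;                  /* hypothetical MMIO base */

        /* Root controller entry point invoked from the arch IRQ exception. */
        static void foo_handle_irq(struct pt_regs *regs)
        {
                u32 hwirq = readl_relaxed(foo_base + 0x10);     /* pending hwirq */

                /* irq_enter(), domain lookup, dispatch and irq_exit() in one go. */
                __handle_domain_irq(foo_root_domain, hwirq, true, regs);
        }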
+ */ +unsigned int kstat_irqs_usr(unsigned int irq) +{ + int sum; + + irq_lock_sparse(); + sum = kstat_irqs(irq); + irq_unlock_sparse(); + return sum; +} diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6534ff6ce02e..7fac311057b8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); +static void irq_domain_check_hierarchy(struct irq_domain *domain); + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates and initialize and irq_domain structure. @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; + irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also @@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, domain = __irq_domain_add(of_node, first_hwirq + size, first_hwirq + size, 0, ops, host_data); - if (!domain) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); + if (domain) + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } @@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int hint; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - hint = hwirq % nr_irqs; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); - if (virq <= 0) - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, + of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; + int virq; domain = irq_data->np ? 
irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { @@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) return 0; } - /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); - if (!virq) - return virq; + if (irq_domain_is_hierarchy(domain)) { + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) + return virq; + + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + if (virq <= 0) + return 0; + } else { + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; + } /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && @@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; if (hwirq < domain->revmap_direct_max_irq) { - data = irq_get_irq_data(hwirq); - if (data && (data->domain == domain) && (data->hwirq == hwirq)) + data = irq_domain_get_irq_data(domain, hwirq); + if (data && data->hwirq == hwirq) return hwirq; } @@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +static int irq_domain_alloc_descs(int virq, unsigned int cnt, + irq_hw_number_t hwirq, int node) +{ + unsigned int hint; + + if (virq >= 0) { + virq = irq_alloc_descs(virq, virq, cnt, node); + } else { + hint = hwirq % nr_irqs; + if (hint == 0) + hint++; + virq = irq_alloc_descs_from(hint, cnt, node); + if (virq <= 0 && hint > 1) + virq = irq_alloc_descs_from(1, cnt, node); + } + + return virq; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * @parent: Parent irq domain to associate with the new domain + * @flags: Irq domain flags associated to the domain + * @size: Size of the domain. See below + * @node: Optional device-tree node of the interrupt controller + * @ops: Pointer to the interrupt domain callbacks + * @host_data: Controller private data pointer + * + * If @size is 0 a tree domain is created, otherwise a linear domain. + * + * If successful the parent is associated to the new domain and the + * domain flags are set. + * Returns pointer to IRQ domain, or NULL on failure. 
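Together with the helpers that follow, the usual consumer of irq_domain_add_hierarchy() is an irqchip driver whose .alloc callback first asks the parent domain for its share and then fills in its own level. A hedged sketch; all foo_* names and FOO_NR_IRQS are hypothetical.

        #include <linux/irq.h>
        #include <linux/irqdomain.h>
        #include <linux/of.h>

        #define FOO_NR_IRQS 32                          /* hypothetical domain size */

        static struct irq_chip foo_chip;                /* chip for this level */

        static int foo_domain_alloc(struct irq_domain *domain, unsigned int virq,
                                    unsigned int nr_irqs, void *arg)
        {
                irq_hw_number_t hwirq = 0;              /* normally decoded from @arg */
                int i, ret;

                /* Let the parent domain allocate its part of the hierarchy first. */
                ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
                if (ret < 0)
                        return ret;

                /* Then fill in hwirq and chip for this level. */
                for (i = 0; i < nr_irqs; i++)
                        irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
                                                      &foo_chip, NULL);
                return 0;
        }

        static const struct irq_domain_ops foo_domain_ops = {
                .alloc = foo_domain_alloc,
                .free  = irq_domain_free_irqs_common,
        };

        static struct irq_domain *foo_create_domain(struct irq_domain *parent,
                                                    struct device_node *node)
        {
                return irq_domain_add_hierarchy(parent, 0, FOO_NR_IRQS, node,
                                                &foo_domain_ops, NULL);
        }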
+ */ +struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct device_node *node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + if (size) + domain = irq_domain_add_linear(node, size, ops, host_data); + else + domain = irq_domain_add_tree(node, ops, host_data); + if (domain) { + domain->parent = parent; + domain->flags |= flags; + } + + return domain; +} + +static void irq_domain_insert_irq(int virq) +{ + struct irq_data *data; + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, data); + mutex_unlock(&revmap_trees_mutex); + } + + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && data->chip) + domain->name = data->chip->name; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); +} + +static void irq_domain_remove_irq(int virq) +{ + struct irq_data *data; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + irq_set_chip_and_handler(virq, NULL, NULL); + synchronize_irq(virq); + smp_mb(); + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + } + } +} + +static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, + struct irq_data *child) +{ + struct irq_data *irq_data; + + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + if (irq_data) { + child->parent_data = irq_data; + irq_data->irq = child->irq; + irq_data->node = child->node; + irq_data->domain = domain; + } + + return irq_data; +} + +static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data, *tmp; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + tmp = irq_data->parent_data; + irq_data->parent_data = NULL; + irq_data->domain = NULL; + + while (tmp) { + irq_data = tmp; + tmp = tmp->parent_data; + kfree(irq_data); + } + } +} + +static int irq_domain_alloc_irq_data(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct irq_domain *parent; + int i; + + /* The outermost irq_data is embedded in struct irq_desc */ + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + irq_data->domain = domain; + + for (parent = domain->parent; parent; parent = parent->parent) { + irq_data = irq_domain_insert_irq_data(parent, irq_data); + if (!irq_data) { + irq_domain_free_irq_data(virq, i + 1); + return -ENOMEM; + } + } + } + + return 0; +} + +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data; + + for (irq_data = irq_get_irq_data(virq); irq_data; + irq_data = irq_data->parent_data) + if (irq_data->domain == domain) + return irq_data; + + return NULL; +} + +/** + * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of 
@virq at @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hwirq number + * @chip: The associated interrupt chip + * @chip_data: The associated chip data + */ +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + if (!irq_data) + return -ENOENT; + + irq_data->hwirq = hwirq; + irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip_data = chip_data; + + return 0; +} + +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); + __irq_set_handler(virq, handler, 0, handler_name); + irq_set_handler_data(virq, handler_data); +} + +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} + +/** + * irq_domain_free_irqs_common - Clear irq_data and free the parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data) + irq_domain_reset_irq_data(irq_data); + } + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +/** + * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_handler_data(virq + i, NULL); + irq_set_handler(virq + i, NULL); + } + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static bool irq_domain_is_auto_recursive(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; +} + +static void irq_domain_free_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs) +{ + domain->ops->free(domain, irq_base, nr_irqs); + if (irq_domain_is_auto_recursive(domain)) { + BUG_ON(!domain->parent); + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); + } +} + +static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) +{ + int ret = 0; + struct irq_domain *parent = domain->parent; + bool recursive = irq_domain_is_auto_recursive(domain); + + BUG_ON(recursive && !parent); + if (recursive) + ret = irq_domain_alloc_irqs_recursive(parent, irq_base, + nr_irqs, 
arg); + if (ret >= 0) + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0 && recursive) + irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); + + return ret; +} + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc: IRQ descriptors have already been allocated if true + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program hardwares with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc) +{ + int i, ret, virq; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + + if (realloc && irq_base >= 0) { + virq = irq_base; + } else { + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + if (virq < 0) { + pr_debug("cannot allocate IRQ(base %d, count %d)\n", + irq_base, nr_irqs); + return virq; + } + } + + if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { + pr_debug("cannot allocate memory for IRQ%d\n", virq); + ret = -ENOMEM; + goto out_free_desc; + } + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + if (ret < 0) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + for (i = 0; i < nr_irqs; i++) + irq_domain_insert_irq(virq + i); + mutex_unlock(&irq_domain_mutex); + + return virq; + +out_free_irq_data: + irq_domain_free_irq_data(virq, nr_irqs); +out_free_desc: + irq_free_descs(virq, nr_irqs); + return ret; +} + +/** + * irq_domain_free_irqs - Free IRQ number and associated data structures + * @virq: base IRQ number + * @nr_irqs: number of IRQs to free + */ +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *data = irq_get_irq_data(virq); + int i; + + if (WARN(!data || !data->domain || !data->domain->ops->free, + "NULL pointer, cannot free irq\n")) + return; + + mutex_lock(&irq_domain_mutex); + for (i = 0; i < nr_irqs; i++) + irq_domain_remove_irq(virq + i); + irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); + mutex_unlock(&irq_domain_mutex); + + irq_domain_free_irq_data(virq, nr_irqs); + irq_free_descs(virq, nr_irqs); +} + +/** + * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to allocate + * @arg: Allocation data (arch/domain specific) + * + * Check whether the domain has been setup recursive. If not allocate + * through the parent domain. 
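From a consumer's point of view, the two-step split described in the comment above looks roughly like the sketch below: allocation only reserves descriptors and walks the ->alloc() chain, and the hardware is programmed later when the interrupt is started up (request_irq() ends up in irq_startup(), which now calls irq_domain_activate_irq()). Names other than the core APIs are hypothetical.

        #include <linux/interrupt.h>
        #include <linux/irqdomain.h>

        static int foo_setup_irq(struct irq_domain *domain, void *arg,
                                 irq_handler_t handler, void *dev_id)
        {
                int virq;

                /* Step 1: allocate a descriptor and run ->alloc() down the hierarchy. */
                virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, arg);
                if (virq <= 0)
                        return virq < 0 ? virq : -ENOSPC;

                /*
                 * Step 2: request_irq() -> irq_startup() -> irq_domain_activate_irq()
                 * programs the interrupt controllers with the resources reserved above.
                 */
                return request_irq(virq, handler, 0, "foo", dev_id);
        }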
+ */ +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs, + void *arg) +{ + /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ + if (irq_domain_is_auto_recursive(domain)) + return 0; + + domain = domain->parent; + if (domain) + return irq_domain_alloc_irqs_recursive(domain, irq_base, + nr_irqs, arg); + return -ENOSYS; +} + +/** + * irq_domain_free_irqs_parent - Free interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to free + * + * Check whether the domain has been setup recursive. If not free + * through the parent domain. + */ +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs) +{ + /* irq_domain_free_irqs_recursive() will call parent's free */ + if (!irq_domain_is_auto_recursive(domain) && domain->parent) + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); +} + +/** + * irq_domain_activate_irq - Call domain_ops->activate recursively to activate + * interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * This is the second step to call domain_ops->activate to program interrupt + * controllers, so the interrupt could actually get delivered. + */ +void irq_domain_activate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (irq_data->parent_data) + irq_domain_activate_irq(irq_data->parent_data); + if (domain->ops->activate) + domain->ops->activate(domain, irq_data); + } +} + +/** + * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to + * deactivate interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * It calls domain_ops->deactivate to program interrupt controllers to disable + * interrupt delivery. + */ +void irq_domain_deactivate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); + if (irq_data->parent_data) + irq_domain_deactivate_irq(irq_data->parent_data); + } +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ + /* Hierarchy irq_domains must implement callback alloc() */ + if (domain->ops->alloc) + domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; +} +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + return (irq_data && irq_data->domain == domain) ? 
irq_data : NULL; +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ +} +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61bf06a..80692373abd6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); @@ -382,14 +383,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +void __disable_irq(struct irq_desc *desc, unsigned int irq) { - if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) - return; - desc->istate |= IRQS_SUSPENDED; - } - if (!desc->depth++) irq_disable(desc); } @@ -401,7 +396,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq, false); + __disable_irq(desc, irq); irq_put_desc_busunlock(desc, flags); return 0; } @@ -442,20 +437,8 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); -void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) +void __enable_irq(struct irq_desc *desc, unsigned int irq) { - if (resume) { - if (!(desc->istate & IRQS_SUSPENDED)) { - if (!desc->action) - return; - if (!(desc->action->flags & IRQF_FORCE_RESUME)) - return; - /* Pretend that it got disabled ! */ - desc->depth++; - } - desc->istate &= ~IRQS_SUSPENDED; - } - switch (desc->depth) { case 0: err_out: @@ -497,7 +480,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); out: irq_put_desc_busunlock(desc, flags); } @@ -618,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); @@ -1218,6 +1202,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; *old_ptr = new; + irq_pm_install_action(desc, new); + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -1228,7 +1214,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); } raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -1336,6 +1322,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; + irq_pm_remove_action(desc, action); + /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_shutdown(desc); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..3e18163f336f --- /dev/null +++ b/kernel/irq/msi.c @@ -0,0 +1,330 @@ +/* + * linux/kernel/irq/msi.c + * + * Copyright (C) 2014 Intel Corp. + * Author: Jiang Liu <jiang.liu@linux.intel.com> + * + * This file is licensed under GPLv2. + * + * This file contains common code to support Message Signalled Interrupt for + * PCI compatible and non PCI compatible devices. 
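The new IRQ_SET_MASK_OK_DONE return value handled above behaves like IRQ_SET_MASK_OK as far as the core is concerned (the affinity mask is still copied), but it additionally tells stacked chips, such as the MSI code below, that the parent has already done all the work and no further action is needed. A hedged sketch of a parent chip making use of it; the routing register access is hypothetical.

        #include <linux/cpumask.h>
        #include <linux/irq.h>

        static void foo_write_route(irq_hw_number_t hwirq, unsigned int cpu)
        {
                /* hypothetical: program the routing register for @hwirq */
        }

        static int foo_parent_set_affinity(struct irq_data *data,
                                           const struct cpumask *mask, bool force)
        {
                unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

                if (cpu >= nr_cpu_ids)
                        return -EINVAL;

                foo_write_route(data->hwirq, cpu);

                /*
                 * Everything is in place; a stacked MSI chip seeing this return
                 * value skips composing and writing a new MSI message
                 * (see msi_domain_set_affinity() below).
                 */
                return IRQ_SET_MASK_OK_DONE;
        }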
+ */ +#include <linux/types.h> +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/msi.h> + +/* Temparory solution for building, will be removed later */ +#include <linux/pci.h> + +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + *msg = entry->msg; +} + +void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_get_msi_desc(irq); + + __get_cached_msi_msg(entry, msg); +} +EXPORT_SYMBOL_GPL(get_cached_msi_msg); + +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +static inline void irq_chip_write_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + data->chip->irq_write_msi_msg(data, msg); +} + +/** + * msi_domain_set_affinity - Generic affinity setter function for MSI domains + * @irq_data: The irq data associated to the interrupt + * @mask: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Intended to be used by MSI interrupt controllers which are + * implemented with hierarchical domains. + */ +int msi_domain_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct msi_msg msg; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); + } + + return ret; +} + +static void msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static void msi_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + irq_hw_number_t hwirq = ops->get_hwirq(info, arg); + int i, ret; + + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + for (i = 0; i < nr_irqs; i++) { + ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); + if (ret < 0) { + if (ops->msi_free) { + for (i--; i > 0; i--) + ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); + return ret; + } + } + + return 0; +} + +static void msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct msi_domain_info *info = domain->host_data; + int i; + + if (info->ops->msi_free) { + for (i = 0; i < nr_irqs; i++) + info->ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static struct irq_domain_ops msi_domain_ops = { + .alloc = msi_domain_alloc, + .free = msi_domain_free, + .activate = msi_domain_activate, + .deactivate = msi_domain_deactivate, +}; + +#ifdef GENERIC_MSI_DOMAIN_OPS +static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->hwirq; +} + +static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + memset(arg, 0, sizeof(*arg)); + return 0; +} + +static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, + struct msi_desc *desc) 
+{ + arg->desc = desc; +} +#else +#define msi_domain_ops_get_hwirq NULL +#define msi_domain_ops_prepare NULL +#define msi_domain_ops_set_desc NULL +#endif /* !GENERIC_MSI_DOMAIN_OPS */ + +static int msi_domain_ops_init(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int virq, irq_hw_number_t hwirq, + msi_alloc_info_t *arg) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip, + info->chip_data); + if (info->handler && info->handler_name) { + __irq_set_handler(virq, info->handler, 0, info->handler_name); + if (info->handler_data) + irq_set_handler_data(virq, info->handler_data); + } + return 0; +} + +static int msi_domain_ops_check(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + return 0; +} + +static struct msi_domain_ops msi_domain_ops_default = { + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, +}; + +static void msi_domain_update_dom_ops(struct msi_domain_info *info) +{ + struct msi_domain_ops *ops = info->ops; + + if (ops == NULL) { + info->ops = &msi_domain_ops_default; + return; + } + + if (ops->get_hwirq == NULL) + ops->get_hwirq = msi_domain_ops_default.get_hwirq; + if (ops->msi_init == NULL) + ops->msi_init = msi_domain_ops_default.msi_init; + if (ops->msi_check == NULL) + ops->msi_check = msi_domain_ops_default.msi_check; + if (ops->msi_prepare == NULL) + ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->set_desc == NULL) + ops->set_desc = msi_domain_ops_default.set_desc; +} + +static void msi_domain_update_chip_ops(struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + BUG_ON(!chip); + if (!chip->irq_mask) + chip->irq_mask = pci_msi_mask_irq; + if (!chip->irq_unmask) + chip->irq_unmask = pci_msi_unmask_irq; + if (!chip->irq_set_affinity) + chip->irq_set_affinity = msi_domain_set_affinity; +} + +/** + * msi_create_irq_domain - Create a MSI interrupt domain + * @of_node: Optional device-tree node of the interrupt controller + * @info: MSI domain info + * @parent: Parent irq domain + */ +struct irq_domain *msi_create_irq_domain(struct device_node *node, + struct msi_domain_info *info, + struct irq_domain *parent) +{ + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) + msi_domain_update_dom_ops(info); + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + msi_domain_update_chip_ops(info); + + return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, + info); +} + +/** + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. 
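A typical consumer of the code above is a controller driver that fills a msi_domain_info, lets the MSI_FLAG_USE_DEF_* flags pull in the default domain ops and chip callbacks via msi_domain_update_dom_ops()/msi_domain_update_chip_ops(), and stacks the resulting domain on its parent. A hedged sketch with hypothetical foo_* names and doorbell address:

        #include <linux/irq.h>
        #include <linux/irqdomain.h>
        #include <linux/msi.h>

        #define FOO_MSI_DOORBELL 0x2000UL               /* hypothetical doorbell */

        static void foo_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
        {
                msg->address_hi = 0;
                msg->address_lo = FOO_MSI_DOORBELL;
                msg->data       = data->hwirq;
        }

        static struct irq_chip foo_msi_chip = {
                .name                = "FOO-MSI",
                .irq_compose_msi_msg = foo_compose_msi_msg,
                /* mask/unmask/set_affinity are filled in by the default chip ops */
        };

        static struct msi_domain_info foo_msi_domain_info = {
                .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
                .chip  = &foo_msi_chip,
        };

        static struct irq_domain *foo_create_msi_domain(struct device_node *node,
                                                        struct irq_domain *parent)
        {
                return msi_create_irq_domain(node, &foo_msi_domain_info, parent);
        }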
+ */ +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + msi_alloc_info_t arg; + struct msi_desc *desc; + int i, ret, virq = -1; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, &arg); + if (ret) + return ret; + + for_each_msi_entry(desc, dev) { + ops->set_desc(&arg, desc); + if (info->flags & MSI_FLAG_IDENTITY_MAP) + virq = (int)ops->get_hwirq(info, &arg); + else + virq = -1; + + virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + dev_to_node(dev), &arg, false); + if (virq < 0) { + ret = -ENOSPC; + if (ops->handle_error) + ret = ops->handle_error(domain, desc, ret); + if (ops->msi_finish) + ops->msi_finish(&arg, ret); + return ret; + } + + for (i = 0; i < desc->nvec_used; i++) + irq_set_msi_desc_off(virq, i, desc); + } + + if (ops->msi_finish) + ops->msi_finish(&arg, 0); + + for_each_msi_entry(desc, dev) { + if (desc->nvec_used == 1) + dev_dbg(dev, "irq %d for MSI\n", virq); + else + dev_dbg(dev, "irq [%d-%d] for MSI\n", + virq, virq + desc->nvec_used - 1); + } + + return 0; +} + +/** + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_desc *desc; + + for_each_msi_entry(desc, dev) { + irq_domain_free_irqs(desc->irq, desc->nvec_used); + desc->irq = 0; + } +} + +/** + * msi_get_domain_info - Get the MSI interrupt domain info for @domain + * @domain: The interrupt domain to retrieve data from + * + * Returns the pointer to the msi_domain_info stored in + * @domain->host_data. + */ +struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) +{ + return (struct msi_domain_info *)domain->host_data; +} + +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index abcd6ca86cb7..3ca532592704 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,17 +9,105 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" +bool irq_pm_check_wakeup(struct irq_desc *desc) +{ + if (irqd_is_wakeup_armed(&desc->irq_data)) { + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_wakeup(); + return true; + } + return false; +} + +/* + * Called from __setup_irq() with desc->lock held after @action has + * been installed in the action chain. + */ +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions++; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth++; + + WARN_ON_ONCE(desc->force_resume_depth && + desc->force_resume_depth != desc->nr_actions); + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth++; + + WARN_ON_ONCE(desc->no_suspend_depth && + desc->no_suspend_depth != desc->nr_actions); +} + +/* + * Called from __free_irq() with desc->lock held after @action has + * been removed from the action chain. 
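Seen from a driver, the bookkeeping installed above is driven by two existing interfaces: IRQF_NO_SUSPEND at request time bumps no_suspend_depth so the line is left enabled by suspend_device_irqs(), while enable_irq_wake() marks the interrupt as a wakeup source so the flow handlers defer it through irq_pm_check_wakeup(). A hedged sketch; the handler and irq numbers are hypothetical.

        #include <linux/interrupt.h>

        static irqreturn_t foo_irq_handler(int irq, void *dev_id)
        {
                return IRQ_HANDLED;                     /* hypothetical handler */
        }

        static int foo_request_irqs(unsigned int tick_irq, unsigned int wake_irq,
                                    void *dev)
        {
                int ret;

                /* Left enabled across suspend: bumps desc->no_suspend_depth. */
                ret = request_irq(tick_irq, foo_irq_handler, IRQF_NO_SUSPEND,
                                  "foo-tick", dev);
                if (ret)
                        return ret;

                /* Normal interrupt, additionally armed as a system wakeup source. */
                ret = request_irq(wake_irq, foo_irq_handler, 0, "foo-wake", dev);
                if (ret)
                        return ret;

                return enable_irq_wake(wake_irq);
        }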
+ */ +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions--; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth--; + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth--; +} + +static bool suspend_device_irq(struct irq_desc *desc, int irq) +{ + if (!desc->action || desc->no_suspend_depth) + return false; + + if (irqd_is_wakeup_set(&desc->irq_data)) { + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + /* + * We return true here to force the caller to issue + * synchronize_irq(). We need to make sure that the + * IRQD_WAKEUP_ARMED is visible before we return from + * suspend_device_irqs(). + */ + return true; + } + + desc->istate |= IRQS_SUSPENDED; + __disable_irq(desc, irq); + + /* + * Hardware which has no wakeup source configuration facility + * requires that the non wakeup interrupts are masked at the + * chip level. The chip implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + return true; +} + /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device drivers need to be prevented - * from receiving interrupts and this function is provided for this purpose. - * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQS_SUSPENDED flag for each of them. + * During system-wide suspend or hibernation device drivers need to be + * prevented from receiving interrupts and this function is provided + * for this purpose. + * + * So we disable all interrupts and mark them IRQS_SUSPENDED except + * for those which are unused, those which are marked as not + * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND + * set and those which are marked as active wakeup sources. + * + * The active wakeup sources are handled by the flow handler entry + * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the + * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { @@ -28,18 +116,36 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; + bool sync; raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, true); + sync = suspend_device_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - } - for_each_irq_desc(irq, desc) - if (desc->istate & IRQS_SUSPENDED) + if (sync) synchronize_irq(irq); + } } EXPORT_SYMBOL_GPL(suspend_device_irqs); +static void resume_irq(struct irq_desc *desc, int irq) +{ + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + + if (desc->istate & IRQS_SUSPENDED) + goto resume; + + /* Force resume the interrupt? */ + if (!desc->force_resume_depth) + return; + + /* Pretend that it got disabled ! 
*/ + desc->depth++; +resume: + desc->istate &= ~IRQS_SUSPENDED; + __enable_irq(desc, irq); +} + static void resume_irqs(bool want_early) { struct irq_desc *desc; @@ -54,7 +160,7 @@ static void resume_irqs(bool want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq, true); + resume_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -93,38 +199,3 @@ void resume_device_irqs(void) resume_irqs(false); } EXPORT_SYMBOL_GPL(resume_device_irqs); - -/** - * check_wakeup_irqs - check if any wake-up interrupts are pending - */ -int check_wakeup_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - /* - * Only interrupts which are marked as wakeup source - * and have not been disabled before the suspend check - * can abort suspend. - */ - if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->depth == 1 && desc->istate & IRQS_PENDING) - return -EBUSY; - continue; - } - /* - * Check the non wakeup interrupts whether they need - * to be masked before finally going into suspend - * state. That's for hardware which has no wakeup - * source configuration facility. The chip - * implementation indicates that with - * IRQCHIP_MASK_ON_SUSPEND. - */ - if (desc->istate & IRQS_SUSPENDED && - irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) - mask_irq(desc); - } - - return 0; -} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ac1ba2f11032..9dc9bfd8a678 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -15,6 +15,23 @@ #include "internals.h" +/* + * Access rules: + * + * procfs protects read/write of /proc/irq/N/ files against a + * concurrent free of the interrupt descriptor. remove_proc_entry() + * immediately prevents new read/writes to happen and waits for + * already running read/write functions to complete. + * + * We remove the proc entries first and then delete the interrupt + * descriptor from the radix tree and free it. So it is guaranteed + * that irq_to_desc(N) is valid as long as the read/writes are + * permitted by procfs. + * + * The read from /proc/interrupts is a different problem because there + * is no protection. So the lookup and the access to irqdesc + * information must be protected by sparse_irq_lock. 
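The rule spelled out above is what kstat_irqs_usr() packages up for readers outside kernel/irq/: the descriptor lookup and the per-cpu summation run under sparse_irq_lock, so the descriptor cannot be freed underneath them. A hedged sketch of such a reader:

        #include <linux/irqnr.h>
        #include <linux/kernel_stat.h>

        /* Sum all interrupt counts, the way a /proc style reader would. */
        static u64 foo_total_irq_count(void)
        {
                unsigned int irq;
                u64 sum = 0;

                for_each_irq_nr(irq)
                        sum += kstat_irqs_usr(irq);     /* takes sparse_irq_lock */

                return sum;
        }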
+ */ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP @@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } + irq_lock_sparse(); desc = irq_to_desc(i); if (!desc) - return 0; + goto outsparse; raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) @@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: raw_spin_unlock_irqrestore(&desc->lock, flags); +outsparse: + irq_unlock_sparse(); return 0; } #endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..cbf9fb899d92 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work) /* If the work is "lazy", handle it from next tick if any */ if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } @@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; - raised = &__get_cpu_var(raised_list); - lazy = &__get_cpu_var(lazy_list); - if (llist_empty(raised) && llist_empty(lazy)) - return false; + raised = this_cpu_ptr(&raised_list); + lazy = this_cpu_ptr(&lazy_list); + + if (llist_empty(raised) || arch_irq_work_has_interrupt()) + if (llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list) */ void irq_work_run(void) { - irq_work_run_list(&__get_cpu_var(raised_list)); - irq_work_run_list(&__get_cpu_var(lazy_list)); + irq_work_run_list(this_cpu_ptr(&raised_list)); + irq_work_run_list(this_cpu_ptr(&lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); +void irq_work_tick(void) +{ + struct llist_head *raised = this_cpu_ptr(&raised_list); + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ae5167087845..5c5987f10819 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file) * using get_symbol_offset for every symbol. 
*/ struct kallsym_iter *iter; - int ret; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); - ret = seq_open(file, &kallsyms_op); - if (ret == 0) - ((struct seq_file *)file->private_data)->private = iter; - else - kfree(iter); - return ret; + return 0; } #ifdef CONFIG_KGDB_KDB diff --git a/kernel/kexec.c b/kernel/kexec.c index 2bee072268d9..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!kexec_on_panic) { image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free_control_pages; } } @@ -1759,7 +1759,6 @@ static __initdata char *suffix_tbl[] = { */ static int __init parse_crashkernel_suffix(char *cmdline, unsigned long long *crash_size, - unsigned long long *crash_base, const char *suffix) { char *cur = cmdline; @@ -1848,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline, if (suffix) return parse_crashkernel_suffix(ck_cmdline, crash_size, - crash_base, suffix); + suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -2016,22 +2015,6 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); #ifdef CONFIG_KEXEC_FILE -static int __kexec_add_segment(struct kimage *image, char *buf, - unsigned long bufsz, unsigned long mem, - unsigned long memsz) -{ - struct kexec_segment *ksegment; - - ksegment = &image->segment[image->nr_segments]; - ksegment->kbuf = buf; - ksegment->bufsz = bufsz; - ksegment->mem = mem; - ksegment->memsz = memsz; - image->nr_segments++; - - return 0; -} - static int locate_mem_hole_top_down(unsigned long start, unsigned long end, struct kexec_buf *kbuf) { @@ -2064,8 +2047,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, } while (1); /* If we are here, we found a suitable memory range */ - __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, - kbuf->memsz); + kbuf->mem = temp_start; /* Success, stop navigating through remaining System RAM ranges */ return 1; @@ -2099,8 +2081,7 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, } while (1); /* If we are here, we found a suitable memory range */ - __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, - kbuf->memsz); + kbuf->mem = temp_start; /* Success, stop navigating through remaining System RAM ranges */ return 1; @@ -2187,7 +2168,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, } /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments - 1]; + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; *load_addr = ksegment->mem; return 0; } diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..2777f40a9c7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -47,13 +47,6 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -/* - * kmod_thread_locker is used for deadlock avoidance. There is no explicit - * locking to protect this global - it is private to the singleton khelper - * thread and should only ever be modified by that thread. 
- */ -static const struct task_struct *kmod_thread_locker; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ @@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } -static int call_helper(void *data) -{ - /* Worker thread started blocking khelper thread. */ - kmod_thread_locker = current; - return ____call_usermodehelper(data); -} - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - /* CLONE_VFORK: wait until the usermode helper has execve'd - * successfully We need the data structures to stay around - * until that is done. */ - if (wait == UMH_WAIT_PROC) + if (sub_info->wait & UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else { - pid = kernel_thread(call_helper, sub_info, - CLONE_VFORK | SIGCHLD); - /* Worker thread stopped blocking khelper thread. 
*/ - kmod_thread_locker = NULL; - } - - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; + else + pid = kernel_thread(____call_usermodehelper, sub_info, + SIGCHLD); - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } /* - * Worker thread must not wait for khelper thread at below - * wait_for_completion() if the thread was created with CLONE_VFORK - * flag, for khelper thread is already waiting for the thread at - * wait_for_completion() in do_fork(). + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. */ - if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { - retval = -EBUSY; - goto out; - } - - sub_info->complete = &done; + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3995f546d0f3..06f58309fed2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, - .flags = FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; static int kprobe_ftrace_enabled; @@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +int __weak arch_check_ftrace_location(struct kprobe *p) { - int ret = 0; unsigned long ftrace_addr; - /* - * If the address is located on a ftrace nop, set the - * breakpoint to the following instruction. - */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { #ifdef CONFIG_KPROBES_ON_FTRACE @@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p, return -EINVAL; #endif } + return 0; +} +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) +{ + int ret; + + ret = arch_check_ftrace_location(p); + if (ret) + return ret; jump_label_lock(); preempt_disable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index ef483220e855..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0955b885d0dc..ec8cce259779 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -20,30 +20,20 @@ * Author: Paul E. McKenney <paulmck@us.ibm.com> * Based on kernel/rcu/torture.c. 
*/ -#include <linux/types.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> -#include <linux/err.h> #include <linux/spinlock.h> +#include <linux/rwlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> #include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/freezer.h> -#include <linux/cpu.h> #include <linux/delay.h> -#include <linux/stat.h> #include <linux/slab.h> -#include <linux/trace_clock.h> -#include <asm/byteorder.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, + "Number of read-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); @@ -66,30 +58,28 @@ torture_param(bool, verbose, true, static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, - "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); - -static atomic_t n_lock_torture_errors; + "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); static struct task_struct *stats_task; static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; -static int nrealwriters_stress; static bool lock_is_write_held; +static bool lock_is_read_held; -struct lock_writer_stress_stats { - long n_write_lock_fail; - long n_write_lock_acquired; +struct lock_stress_stats { + long n_lock_fail; + long n_lock_acquired; }; -static struct lock_writer_stress_stats *lwsa; #if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif -int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; -module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); +int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); /* Forward reference. */ static void lock_torture_cleanup(void); @@ -102,12 +92,25 @@ struct lock_torture_ops { int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); void (*writeunlock)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *trsp); + void (*readunlock)(void); unsigned long flags; const char *name; }; -static struct lock_torture_ops *cur_ops; - +struct lock_torture_cxt { + int nrealwriters_stress; + int nrealreaders_stress; + bool debug_lock; + atomic_t n_lock_torture_errors; + struct lock_torture_ops *cur_ops; + struct lock_stress_stats *lwsa; /* writer statistics */ + struct lock_stress_stats *lrsa; /* reader statistics */ +}; +static struct lock_torture_cxt cxt = { 0, 0, false, + ATOMIC_INIT(0), + NULL, NULL}; /* * Definitions for lock torture testing. */ @@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. 
*/ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, .writeunlock = torture_lock_busted_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "lock_busted" }; @@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) * we want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); if (!(torture_random(trsp) % - (nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "spin_lock" }; static int torture_spin_lock_write_lock_irq(void) -__acquires(torture_spinlock_irq) +__acquires(torture_spinlock) { unsigned long flags; spin_lock_irqsave(&torture_spinlock, flags); - cur_ops->flags = flags; + cxt.cur_ops->flags = flags; return 0; } static void torture_lock_spin_write_unlock_irq(void) __releases(torture_spinlock) { - spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); + spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); } static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_lock_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "spin_lock_irq" }; +static DEFINE_RWLOCK(torture_rwlock); + +static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +{ + write_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +{ + write_unlock(&torture_rwlock); +} + +static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +{ + read_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_read_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 10; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. 
+ */ + if (!(torture_random(trsp) % + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +{ + read_unlock(&torture_rwlock); +} + +static struct lock_torture_ops rw_lock_ops = { + .writelock = torture_rwlock_write_lock, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock, + .readlock = torture_rwlock_read_lock, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock, + .name = "rw_lock" +}; + +static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + write_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_write_unlock_irq(void) +__releases(torture_rwlock) +{ + write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); +} + +static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + read_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_read_unlock_irq(void) +__releases(torture_rwlock) +{ + write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops rw_lock_irq_ops = { + .writelock = torture_rwlock_write_lock_irq, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock_irq, + .readlock = torture_rwlock_read_lock_irq, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock_irq, + .name = "rw_lock_irq" +}; + +static DEFINE_MUTEX(torture_mutex); + +static int torture_mutex_lock(void) __acquires(torture_mutex) +{ + mutex_lock(&torture_mutex); + return 0; +} + +static void torture_mutex_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 5); + else + mdelay(longdelay_ms / 5); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_mutex_unlock(void) __releases(torture_mutex) +{ + mutex_unlock(&torture_mutex); +} + +static struct lock_torture_ops mutex_lock_ops = { + .writelock = torture_mutex_lock, + .write_delay = torture_mutex_delay, + .writeunlock = torture_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "mutex_lock" +}; + +static DECLARE_RWSEM(torture_rwsem); +static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +{ + down_write(&torture_rwsem); + return 0; +} + +static void torture_rwsem_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 10); + else + mdelay(longdelay_ms / 10); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_rwsem_up_write(void) __releases(torture_rwsem) +{ + up_write(&torture_rwsem); +} + +static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +{ + down_read(&torture_rwsem); + return 0; +} + +static void torture_rwsem_read_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 2); + else + mdelay(longdelay_ms / 2); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rwsem_up_read(void) __releases(torture_rwsem) +{ + up_read(&torture_rwsem); +} + +static struct lock_torture_ops rwsem_lock_ops = { + .writelock = torture_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .writeunlock = torture_rwsem_up_write, + .readlock = torture_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_rwsem_up_read, + .name = "rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. */ static int lock_torture_writer(void *arg) { - struct lock_writer_stress_stats *lwsp = arg; + struct lock_stress_stats *lwsp = arg; static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); @@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cur_ops->writelock(); + + cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_write_lock_fail++; + lwsp->n_lock_fail++; lock_is_write_held = 1; - lwsp->n_write_lock_acquired++; - cur_ops->write_delay(&rand); + if (WARN_ON_ONCE(lock_is_read_held)) + lwsp->n_lock_fail++; /* rare, but... */ + + lwsp->n_lock_acquired++; + cxt.cur_ops->write_delay(&rand); lock_is_write_held = 0; - cur_ops->writeunlock(); + cxt.cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_writer"); @@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg) } /* + * Lock torture reader kthread. Repeatedly acquires and releases + * the reader lock. + */ +static int lock_torture_reader(void *arg) +{ + struct lock_stress_stats *lrsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_reader task started"); + set_user_nice(current, MAX_NICE); + + do { + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); + + cxt.cur_ops->readlock(); + lock_is_read_held = 1; + if (WARN_ON_ONCE(lock_is_write_held)) + lrsp->n_lock_fail++; /* rare, but... */ + + lrsp->n_lock_acquired++; + cxt.cur_ops->read_delay(&rand); + lock_is_read_held = 0; + cxt.cur_ops->readunlock(); + + stutter_wait("lock_torture_reader"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_reader"); + return 0; +} + +/* * Create an lock-torture-statistics message in the specified buffer. 
*/ -static void lock_torture_printk(char *page) +static void __torture_print_stats(char *page, + struct lock_stress_stats *statp, bool write) { bool fail = 0; - int i; + int i, n_stress; long max = 0; - long min = lwsa[0].n_write_lock_acquired; + long min = statp[0].n_lock_acquired; long long sum = 0; - for (i = 0; i < nrealwriters_stress; i++) { - if (lwsa[i].n_write_lock_fail) + n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; + for (i = 0; i < n_stress; i++) { + if (statp[i].n_lock_fail) fail = true; - sum += lwsa[i].n_write_lock_acquired; - if (max < lwsa[i].n_write_lock_fail) - max = lwsa[i].n_write_lock_fail; - if (min > lwsa[i].n_write_lock_fail) - min = lwsa[i].n_write_lock_fail; + sum += statp[i].n_lock_acquired; + if (max < statp[i].n_lock_fail) + max = statp[i].n_lock_fail; + if (min > statp[i].n_lock_fail) + min = statp[i].n_lock_fail; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); page += sprintf(page, - "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + write ? "Writes" : "Reads ", sum, max, min, max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) - atomic_inc(&n_lock_torture_errors); + atomic_inc(&cxt.n_lock_torture_errors); } /* @@ -274,18 +533,35 @@ static void lock_torture_printk(char *page) */ static void lock_torture_stats_print(void) { - int size = nrealwriters_stress * 200 + 8192; + int size = cxt.nrealwriters_stress * 200 + 8192; char *buf; + if (cxt.cur_ops->readlock) + size += cxt.nrealreaders_stress * 200 + 8192; + buf = kmalloc(size, GFP_KERNEL); if (!buf) { pr_err("lock_torture_stats_print: Out of memory, need: %d", size); return; } - lock_torture_printk(buf); + + __torture_print_stats(buf, cxt.lwsa, true); pr_alert("%s", buf); kfree(buf); + + if (cxt.cur_ops->readlock) { + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + + __torture_print_stats(buf, cxt.lrsa, false); + pr_alert("%s", buf); + kfree(buf); + } } /* @@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealwriters_stress, stat_interval, verbose, - shuffle_interval, stutter, shutdown_secs, + "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, cxt.debug_lock ? " [debug]": "", + cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, + verbose, shuffle_interval, stutter, shutdown_secs, onoff_interval, onoff_holdoff); } @@ -322,46 +599,59 @@ static void lock_torture_cleanup(void) { int i; - if (torture_cleanup()) + if (torture_cleanup_begin()) return; if (writer_tasks) { - for (i = 0; i < nrealwriters_stress; i++) + for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } + if (reader_tasks) { + for (i = 0; i < cxt.nrealreaders_stress; i++) + torture_stop_kthread(lock_torture_reader, + reader_tasks[i]); + kfree(reader_tasks); + reader_tasks = NULL; + } + torture_stop_kthread(lock_torture_stats, stats_task); lock_torture_stats_print(); /* -After- the stats thread is stopped! 
*/ - if (atomic_read(&n_lock_torture_errors)) - lock_torture_print_module_parms(cur_ops, + if (atomic_read(&cxt.n_lock_torture_errors)) + lock_torture_print_module_parms(cxt.cur_ops, "End of test: FAILURE"); else if (torture_onoff_failures()) - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: LOCK_HOTPLUG"); else - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } static int __init lock_torture_init(void) { - int i; + int i, j; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &lock_busted_ops, + &spin_lock_ops, &spin_lock_irq_ops, + &rw_lock_ops, &rw_lock_irq_ops, + &mutex_lock_ops, + &rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) + cxt.cur_ops = torture_ops[i]; + if (strcmp(torture_type, cxt.cur_ops->name) == 0) break; } if (i == ARRAY_SIZE(torture_ops)) { @@ -374,31 +664,69 @@ static int __init lock_torture_init(void) torture_init_end(); return -EINVAL; } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + if (cxt.cur_ops->init) + cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nwriters_stress >= 0) - nrealwriters_stress = nwriters_stress; + cxt.nrealwriters_stress = nwriters_stress; else - nrealwriters_stress = 2 * num_online_cpus(); - lock_torture_print_module_parms(cur_ops, "Start of test"); + cxt.nrealwriters_stress = 2 * num_online_cpus(); + +#ifdef CONFIG_DEBUG_MUTEXES + if (strncmp(torture_type, "mutex", 5) == 0) + cxt.debug_lock = true; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + if ((strncmp(torture_type, "spin", 4) == 0) || + (strncmp(torture_type, "rw_lock", 7) == 0)) + cxt.debug_lock = true; +#endif /* Initialize the statistics so that each run gets its own numbers. */ lock_is_write_held = 0; - lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); - if (lwsa == NULL) { - VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { - lwsa[i].n_write_lock_fail = 0; - lwsa[i].n_write_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; } - /* Start up the kthreads. */ + if (cxt.cur_ops->readlock) { + if (nreaders_stress >= 0) + cxt.nrealreaders_stress = nreaders_stress; + else { + /* + * By default distribute evenly the number of + * readers and writers. We still run the same number + * of threads as the writer-only locks default. 
+ */ + if (nwriters_stress < 0) /* user doesn't care */ + cxt.nrealwriters_stress = num_online_cpus(); + cxt.nrealreaders_stress = cxt.nrealwriters_stress; + } + + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } + } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); + /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); @@ -422,18 +750,51 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { - firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + + if (cxt.cur_ops->readlock) { + reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } + + /* + * Create the kthreads and start torturing (oh, those poor little locks). + * + * TODO: Note that we interleave writers with readers, giving writers a + * slight advantage, by creating its kthread first. This can be modified + * for very specific needs, or even let the user choose the policy, if + * ever wanted. + */ + for (i = 0, j = 0; i < cxt.nrealwriters_stress || + j < cxt.nrealreaders_stress; i++, j++) { + if (i >= cxt.nrealwriters_stress) + goto create_reader; + + /* Create writer. */ + firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], writer_tasks[i]); if (firsterr) goto unwind; + + create_reader: + if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) + continue; + /* Create reader. */ + firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], + reader_tasks[j]); + if (firsterr) + goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 23e89c5930e9..4d60986fcbee 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -56,9 +56,6 @@ do { \ * If the lock has already been acquired, then this will proceed to spin * on this node->locked until the previous lock holder sets the node->locked * in mcs_spin_unlock(). - * - * We don't inline mcs_spin_lock() so that perf can correctly account for the - * time spent in this lock function. */ static inline void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); } /* * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug * mutexes so that we can do it here after we've verified state. 
*/ + mutex_clear_owner(lock); atomic_set(&lock->count, 1); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ae712b25e492..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -15,7 +15,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #include <linux/mutex.h> #include <linux/ww_mutex.h> @@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. + */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * In order to avoid a stampede of mutex spinners from acquiring the mutex @@ -180,6 +266,135 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ return retval; } + +/* + * Atomically try to take the lock when it is available + */ +static inline bool mutex_try_to_acquire(struct mutex *lock) +{ + return !mutex_is_locked(lock) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1); +} + +/* + * Optimistic spinning. 
+ * + * We try to spin for acquisition when we find that the lock owner + * is currently running on a (different) CPU and while we don't + * need to reschedule. The rationale is that if the lock owner is + * running, it is likely to release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. + * + * Returns true when the lock was taken, otherwise false, indicating + * that we need to jump to the slowpath and sleep. + */ +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + struct task_struct *task = current; + + if (!mutex_can_spin_on_owner(lock)) + goto done; + + if (!osq_lock(&lock->osq)) + goto done; + + while (true) { + struct task_struct *owner; + + if (use_ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + break; + } + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* Try to acquire the mutex if it is unlocked. */ + if (mutex_try_to_acquire(lock)) { + lock_acquired(&lock->dep_map, ip); + + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + + mutex_set_owner(lock); + osq_unlock(&lock->osq); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + + osq_unlock(&lock->osq); +done: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) { + /* + * We _should_ have TASK_RUNNING here, but just in case + * we do not, make it so, otherwise we might get stuck. 
+ */ + __set_current_state(TASK_RUNNING); + schedule_preempt_disabled(); + } + + return false; +} +#else +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + return false; +} #endif __visible __used noinline @@ -277,91 +492,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) return 0; } -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -/* - * after acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - unsigned long flags; - struct mutex_waiter *cur; - - ww_mutex_lock_acquired(lock, ctx); - - lock->ctx = ctx; - - /* - * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* ^^^ */ - - /* - * Check if lock is contended, if not there is nobody to wake up - */ - if (likely(atomic_read(&lock->base.count) == 0)) - return; - - /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. - */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); -} - /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -378,104 +508,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - /* - * Optimistic spinning. - * - * We try to spin for acquisition when we find that the lock owner - * is currently running on a (different) CPU and while we don't - * need to reschedule. The rationale is that if the lock owner is - * running, it is likely to release the lock soon. - * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. 
- * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * - * The mutex spinners are queued up using MCS lock so that only one - * spinner can compete for the mutex. However, if mutex spinning isn't - * going to happen, there is no point in going through the lock/unlock - * overhead. - */ - if (!mutex_can_spin_on_owner(lock)) - goto slowpath; - - if (!osq_lock(&lock->osq)) - goto slowpath; - - for (;;) { - struct task_struct *owner; - - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (ACCESS_ONCE(ww->ctx)) - break; - } - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - /* Try to acquire the mutex if it is unlocked. */ - if (!mutex_is_locked(lock) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { - lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); - osq_unlock(&lock->osq); - preempt_enable(); - return 0; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax_lowlatency(); + if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + /* got the lock, yay! */ + preempt_enable(); + return 0; } - osq_unlock(&lock->osq); -slowpath: - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock the mutex. This avoids getting - * scheduled out right after we obtained the mutex. - */ - if (need_resched()) - schedule_preempt_disabled(); -#endif + spin_lock_mutex(&lock->wait_lock, flags); /* @@ -679,15 +717,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +__mutex_unlock_common_slowpath(struct mutex *lock, int nested) { - struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; /* - * some architectures leave the lock unlocked in the fastpath failure + * As a performance measurement, release the lock before doing other + * wakeup related duties to follow. This allows other tasks to acquire + * the lock sooner, while still handling cleanups in past unlock calls. + * This can be done as we do not enforce strict equivalence between the + * mutex counter and wait_list. + * + * + * Some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to - * unlock it here + * unlock it here - as the lock counter is currently 0 or negative. 
*/ if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); @@ -716,7 +760,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) __visible void __mutex_unlock_slowpath(atomic_t *lock_count) { - __mutex_unlock_common_slowpath(lock_count, 1); + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_unlock_common_slowpath(lock, 1); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,7 +16,7 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline void mutex_set_owner(struct mutex *lock) { lock->owner = current; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,7 +8,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> #include <linux/export.h> diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d6203faf2eb1..7628c3fc37ca 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_down_read_failed); static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* try acquiring the write lock */ - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); - return true; - } + /* + * Try acquiring the write lock. Check count first in order + * to reduce unnecessary expensive cmpxchg() operations. 
+ */ + if (count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; } + return false; } @@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_down_write_failed); /* * handle waking up a waiter on the semaphore @@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_wake); /* * downgrade a write lock into a read lock @@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) return sem; } - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..b8120abe594b 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -36,7 +36,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem); /** @@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock); /** * down_timeout - acquire the semaphore within a specified time * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing + * @timeout: how long to wait before failing * * Attempts to acquire the semaphore. If no more tasks are allowed to * acquire the semaphore, calling this function will put the task to sleep. * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long jiffies) +int down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies) if (likely(sem->count > 0)) sem->count--; else - result = __down_timeout(sem, jiffies); + result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem) return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) { - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); + return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } static noinline void __sched __up(struct semaphore *sem) diff --git a/kernel/module.c b/kernel/module.c index 03214bd288e9..3965511ae133 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,7 +42,6 @@ #include <linux/vermagic.h> #include <linux/notifier.h> #include <linux/sched.h> -#include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> @@ -98,7 +97,7 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ + * (delete and add uses RCU list operations). 
*/ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -135,7 +134,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; @@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; -int register_module_notifier(struct notifier_block * nb) +int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); -int unregister_module_notifier(struct notifier_block * nb) +int unregister_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&module_notify_list, nb); } @@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); +/* MODULE_REF_BASE is the base reference count by kmodule loader. */ +#define MODULE_REF_BASE 1 + /* Init the unload section of the module. */ static int module_unload_init(struct module *mod) { - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) - return -ENOMEM; + /* + * Initialize reference counter to MODULE_REF_BASE. + * refcnt == 0 means module is going. + */ + atomic_set(&mod->refcnt, MODULE_REF_BASE); INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - raw_cpu_write(mod->refptr->incs, 1); + atomic_inc(&mod->refcnt); return 0; } @@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); - - free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags) } #endif /* CONFIG_MODULE_FORCE_UNLOAD */ -struct stopref +/* Try to release refcount of module, 0 means success. */ +static int try_release_module_ref(struct module *mod) { - struct module *mod; - int flags; - int *forced; -}; + int ret; -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; + /* Try to decrement refcnt which we set at loading */ + ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); + BUG_ON(ret < 0); + if (ret) + /* Someone can put this right now, recover with checking */ + ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); + + return ret; +} +static int try_stop_module(struct module *mod, int flags, int *forced) +{ /* If it's not unused, quit unless we're forcing. */ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) + if (try_release_module_ref(mod) != 0) { + *forced = try_force_unload(flags); + if (!(*forced)) return -EWOULDBLOCK; } /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} + mod->state = MODULE_STATE_GOING; -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - struct stopref sref = { mod, flags, forced }; - - return stop_machine(__try_stop_module, &sref, NULL); + return 0; } unsigned long module_refcount(struct module *mod) { - unsigned long incs = 0, decs = 0; - int cpu; - - for_each_possible_cpu(cpu) - decs += per_cpu_ptr(mod->refptr, cpu)->decs; - /* - * ensure the incs are added up after the decs. 
- * module_put ensures incs are visible before decs with smp_wmb. - * - * This 2-count scheme avoids the situation where the refcount - * for CPU0 is read, then CPU0 increments the module refcount, - * then CPU1 drops that refcount, then the refcount for CPU1 is - * read. We would record a decrement but not its corresponding - * increment so we would see a low count (disaster). - * - * Rare situation? But module_refcount can be preempted, and we - * might be tallying up 4096+ CPUs. So it is not impossible. - */ - smp_rmb(); - for_each_possible_cpu(cpu) - incs += per_cpu_ptr(mod->refptr, cpu)->incs; - return incs - decs; + return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) seq_printf(m, " %lu ", module_refcount(mod)); - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. */ + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; seq_printf(m, "%s,", use->source->name); @@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) if (mod->init != NULL && mod->exit == NULL) { printed_something = 1; - seq_printf(m, "[permanent],"); + seq_puts(m, "[permanent],"); } if (!printed_something) - seq_printf(m, "-"); + seq_puts(m, "-"); } void __symbol_put(const char *symbol) @@ -935,7 +918,7 @@ void __module_get(struct module *module) { if (module) { preempt_disable(); - __this_cpu_inc(module->refptr->incs); + atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); preempt_enable(); } @@ -948,11 +931,11 @@ bool try_module_get(struct module *module) if (module) { preempt_disable(); - - if (likely(module_is_live(module))) { - __this_cpu_inc(module->refptr->incs); + /* Note: here, we can fail to get a reference */ + if (likely(module_is_live(module) && + atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); - } else + else ret = false; preempt_enable(); @@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get); void module_put(struct module *module) { + int ret; + if (module) { preempt_disable(); - smp_wmb(); /* see comment in module_refcount */ - __this_cpu_inc(module->refptr->decs); - + ret = atomic_dec_if_positive(&module->refcnt); + WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); preempt_enable(); } @@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put); static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. 
*/ - seq_printf(m, " - -"); + seq_puts(m, " - -"); } static inline void module_unload_free(struct module *mod) @@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc, static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs, return 0; bad_version: - printk("%s: disagrees about version of symbol %s\n", + pr_warn("%s: disagrees about version of symbol %s\n", mod->name, symname); return 0; } @@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; } -struct module_sect_attr -{ +struct module_sect_attr { struct module_attribute mattr; char *name; unsigned long address; }; -struct module_sect_attrs -{ +struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; struct module_sect_attr attrs[0]; @@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod) (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); ++temp_attr; } } @@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod) /* pick a field to test for end of list */ if (!attr->attr.name) break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); if (attr->free) attr->free(mod); } @@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - module_bug_cleanup(mod); - return 0; -} - #ifdef CONFIG_DEBUG_SET_MODULE_RONX /* * LKM RO/NX protection: protect module's text/ro-data @@ -1842,7 +1813,9 @@ static void free_module(struct module *mod) /* We leave it in list to prevent duplicate loads, but make sure * that noone uses it while it's being deconstructed. */ + mutex_lock(&module_mutex); mod->state = MODULE_STATE_UNFORMED; + mutex_unlock(&module_mutex); /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1858,7 +1831,12 @@ static void free_module(struct module *mod) /* Now we can delete it from the lists */ mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + /* Remove this module from bug list, this uses list_del_rcu */ + module_bug_cleanup(mod); + /* Wait for RCU synchronizing before releasing mod->list and buglist. */ + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ @@ -1953,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) /* We compiled with -fno-common. These are not supposed to happen. 
*/ pr_debug("Common symbol: %s\n", name); - printk("%s: please compile with -fno-common\n", + pr_warn("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; break; @@ -2257,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum) { const Elf_Shdr *sec; @@ -2733,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) * This shouldn't happen with same compiler and binutils * building all parts of the module. */ - printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + pr_warn("%s: has both .ctors and .init_array.\n", mod->name); return -EINVAL; } @@ -3021,8 +2999,10 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ + /* + * Init routine failed: abort. Try to protect us from + * buggy refcounters. + */ mod->state = MODULE_STATE_GOING; synchronize_sched(); module_put(mod); @@ -3095,6 +3075,32 @@ static int may_init_module(void) } /* + * Can't use wait_event_interruptible() because our condition + * 'finished_loading()' contains a blocking primitive itself (mutex_lock). + */ +static int wait_finished_loading(struct module *mod) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + add_wait_queue(&module_wq, &wait); + for (;;) { + if (finished_loading(mod->name)) + break; + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&module_wq, &wait); + + return ret; +} + +/* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu * memory exhaustion. @@ -3114,8 +3120,8 @@ again: || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); + + err = wait_finished_loading(mod); if (err) goto out_unlocked; goto again; @@ -3174,7 +3180,7 @@ out: static int unknown_module_param_cb(char *param, char *val, const char *modname) { - /* Check for magic 'dyndbg' arg */ + /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); @@ -3324,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); wake_up_all(&module_wq); + /* Wait for RCU synchronizing before releasing mod->list. */ + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); @@ -3388,7 +3396,7 @@ static inline int is_arm_mapping_symbol(const char *str) { if (str[0] == '.' && str[1] == 'L') return true; - return str[0] == '$' && strchr("atd", str[1]) + return str[0] == '$' && strchr("axtd", str[1]) && (str[2] == '\0' || str[2] == '.'); } @@ -3657,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p) /* Informative for users. */ seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. 
*/ seq_printf(m, " 0x%pK", mod->module_core); @@ -3667,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p) if (mod->taints) seq_printf(m, " %s", module_flags(mod, buf)); - seq_printf(m, "\n"); + seq_puts(m, "\n"); return 0; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ef42d0ab3115..49746c81ad8d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_ns *ei; struct file *file; + struct ns_common *ns; int err; file = proc_ns_fget(fd); @@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = get_proc_ns(file_inode(file)); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) + ns = get_proc_ns(file_inode(file)); + if (nstype && (ns->ops->type != nstype)) goto out; new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); @@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } - err = ops->install(new_nsproxy, ei->ns); + err = ns->ops->install(new_nsproxy, ns); if (err) { free_nsproxy(new_nsproxy); goto out; diff --git a/kernel/panic.c b/kernel/panic.c index d09dc5c32c67..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); static bool crash_kexec_post_notifiers; +int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -244,6 +245,7 @@ static const struct tnt tnts[] = { * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. + * 'L' - A soft lockup has previously occurred. * * The string is overwritten by the next call to print_tainted(). */ @@ -427,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, if (args) vprintk(args->fmt, args->args); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). 
+ */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } + print_modules(); dump_stack(); print_oops_end_marker(); @@ -484,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +core_param(panic_on_warn, panic_on_warn, int, 0644); static int __init setup_crash_kexec_post_notifiers(char *s) { diff --git a/kernel/params.c b/kernel/params.c index 34f527023794..0af9b2c4e56c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -19,6 +19,7 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> @@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } +static void param_check_unsafe(const struct kernel_param *kp) +{ + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { + pr_warn("Setting dangerous option %s - tainting kernel\n", + kp->name); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } +} + static int parse_one(char *param, char *val, const char *doing, @@ -104,11 +114,12 @@ static int parse_one(char *param, return 0; /* No one handled NULL, so do it here. */ if (!val && - !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) + !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); mutex_lock(¶m_lock); + param_check_unsafe(¶ms[i]); err = params[i].ops->set(val, ¶ms[i]); mutex_unlock(¶m_lock); return err; @@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; @@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string); #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) -extern struct kernel_param __start___param[], __stop___param[]; - struct param_attribute { struct module_attribute mattr; @@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; mutex_lock(¶m_lock); + param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); mutex_unlock(¶m_lock); if (!err) @@ -593,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { - struct module_param_attrs *new; - struct attribute **attrs; - int err, num; + struct module_param_attrs *new_mp; + struct attribute **new_attrs; + unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { - num = 0; - attrs = NULL; - } else { - num = mk->mp->num; - attrs = mk->mp->grp.attrs; + /* First allocation. */ + mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); + if (!mk->mp) + return -ENOMEM; + mk->mp->grp.name = "parameters"; + /* NULL-terminated attribute array. */ + mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), + GFP_KERNEL); + /* Caller will cleanup via free_module_param_attrs */ + if (!mk->mp->grp.attrs) + return -ENOMEM; } - /* Enlarge. 
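param_check_unsafe() makes setting a parameter flagged KERNEL_PARAM_FL_UNSAFE print a warning and taint the kernel, and it is called both from parse_one() and from the sysfs store path. A userspace sketch of the same bookkeeping, with invented names (struct option, tainted) to keep it self-contained:

#include <stdbool.h>
#include <stdio.h>

struct option {
    const char *name;
    bool        unsafe;                       /* cf. KERNEL_PARAM_FL_UNSAFE */
    int       (*set)(const char *val, struct option *opt);
};

static bool tainted;                          /* cf. the kernel taint flags */

static int option_set(struct option *opt, const char *val)
{
    if (opt->unsafe) {
        fprintf(stderr, "Setting dangerous option %s - tainting config\n",
                opt->name);
        tainted = true;                       /* recorded for later reports */
    }
    return opt->set(val, opt);
}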
*/ - new = krealloc(mk->mp, - sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), - GFP_KERNEL); - if (!new) { - kfree(attrs); - err = -ENOMEM; - goto fail; - } - /* Despite looking like the typical realloc() bug, this is safe. - * We *want* the old 'attrs' to be freed either way, and we'll store - * the new one in the success case. */ - attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); - if (!attrs) { - err = -ENOMEM; - goto fail_free_new; - } + /* Enlarge allocations. */ + new_mp = krealloc(mk->mp, + sizeof(*mk->mp) + + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + GFP_KERNEL); + if (!new_mp) + return -ENOMEM; + mk->mp = new_mp; - /* Sysfs wants everything zeroed. */ - memset(new, 0, sizeof(*new)); - memset(&new->attrs[num], 0, sizeof(new->attrs[num])); - memset(&attrs[num], 0, sizeof(attrs[num])); - new->grp.name = "parameters"; - new->grp.attrs = attrs; + /* Extra pointer for NULL terminator */ + new_attrs = krealloc(mk->mp->grp.attrs, + sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), + GFP_KERNEL); + if (!new_attrs) + return -ENOMEM; + mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - sysfs_attr_init(&new->attrs[num].mattr.attr); - new->attrs[num].param = kp; - new->attrs[num].mattr.show = param_attr_show; - new->attrs[num].mattr.store = param_attr_store; - new->attrs[num].mattr.attr.name = (char *)name; - new->attrs[num].mattr.attr.mode = kp->perm; - new->num = num+1; + sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); + mk->mp->attrs[mk->mp->num].param = kp; + mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + /* Do not allow runtime DAC changes to make param writable. */ + if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; + mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ - for (num = 0; num < new->num; num++) - new->grp.attrs[num] = &new->attrs[num].mattr.attr; - new->grp.attrs[num] = NULL; - - mk->mp = new; + for (i = 0; i < mk->mp->num; i++) + mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; + mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; - -fail_free_new: - kfree(new); -fail: - mk->mp = NULL; - return err; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { - kfree(mk->mp->grp.attrs); + if (mk->mp) + kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } @@ -685,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod, if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); - if (err) + if (err) { + free_module_param_attrs(&mod->mkobj); return err; + } params = true; } @@ -763,7 +768,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name) } static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; @@ -798,7 +803,7 @@ static void __init kernel_add_sysfs_param(const char *name, */ static void __init param_sysfs_builtin(void) { - struct kernel_param *kp; + const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a26698144..cd36a5e0d173 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - 
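The rewritten add_sysfs_param() grows two allocations with krealloc(): the block holding the attribute bodies and the NULL-terminated grp.attrs pointer array, then recomputes every pointer because krealloc() may have moved the storage; error cleanup is left to free_module_param_attrs(), which now tolerates a partially built state. A compact userspace model of that grow-and-refix pattern (struct names are invented):

#include <stdlib.h>

struct attr { const char *name; };

struct attr_group {
    struct attr **ptrs;      /* NULL-terminated array of pointers            */
    size_t        num;       /* live entries                                 */
    struct attr   attrs[];   /* entry bodies, resized together with the head */
};

/* Append one attribute to *grpp (may be NULL initially).  Returns 0 or -1.
 * On failure *grpp stays valid, so the caller can free it in one place,
 * mirroring the free_module_param_attrs() cleanup path. */
static int group_add(struct attr_group **grpp, const char *name)
{
    struct attr_group *grp = *grpp;
    size_t n = grp ? grp->num : 0;
    struct attr_group *ng;
    struct attr **np;
    size_t i;

    ng = realloc(grp, sizeof(*ng) + (n + 1) * sizeof(ng->attrs[0]));
    if (!ng)
        return -1;
    if (!grp) {                     /* first allocation: set up the header */
        ng->ptrs = NULL;
        ng->num  = 0;
    }
    *grpp = ng;                     /* block may have moved */

    np = realloc(ng->ptrs, (n + 2) * sizeof(*np));   /* +1 entry, +NULL */
    if (!np)
        return -1;
    ng->ptrs = np;

    ng->attrs[n].name = name;
    ng->num = n + 1;

    for (i = 0; i < ng->num; i++)   /* attrs[] may have moved: refix all  */
        ng->ptrs[i] = &ng->attrs[i];
    ng->ptrs[ng->num] = NULL;

    return 0;
}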
.proc_inum = PROC_PID_INIT_INO, + .ns.inum = PROC_PID_INIT_INO, +#ifdef CONFIG_PID_NS + .ns.ops = &pidns_operations, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -341,6 +344,8 @@ out: out_unlock: spin_unlock_irq(&pidmap_lock); + put_pid_ns(ns); + out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761b..a65ba137fd15 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) goto out_free_map; + ns->ns.ops = &pidns_operations; kref_init(&ns->kref); ns->level = level; @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); - /* Ignore SIGCHLD causing any terminated children to autoreap */ + /* + * Ignore SIGCHLD causing any terminated children to autoreap. + * This speeds up the namespace shutdown, plus see the comment + * below. + */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); @@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); - /* Firstly reap the EXIT_ZOMBIE children we may have. */ + /* + * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. + * sys_wait4() will also block until our children traced from the + * parent namespace are detached and become EXIT_DEAD. + */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see free_pid(). + * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * really care, we could reparent them to the global init. We could + * exit and reap ->child_reaper even if it is not the last thread in + * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * pid_ns can not go away until proc_kill_sb() drops the reference. + * + * But this ns can also have other tasks injected by setns()+fork(). + * Again, ignoring the user visible semantics we do not really need + * to wait until they are all reaped, but they can be reparented to + * us and thus we need to ensure that pid->child_reaper stays valid + * until they all go away. See free_pid()->wake_up_process(). + * + * We rely on ignored SIGCHLD, an injected zombie must be autoreaped + * if reparented. */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static void *pidns_get(struct task_struct *task) +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + +static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task) get_pid_ns(ns); rcu_read_unlock(); - return ns; + return ns ? 
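The pid namespace now embeds a struct ns_common, and the proc/setns code passes that embedded object around; to_pid_ns() recovers the enclosing pid_namespace with container_of(). A self-contained userspace illustration of the trick (the structures and the inum value here are dummies):

#include <stddef.h>
#include <stdio.h>

/* container_of(): recover the outer structure from a pointer to one of
 * its members, using the member's compile-time offset. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct ns_common     { unsigned int inum; };
struct pid_namespace { int level; struct ns_common ns; };

static struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
    return container_of(ns, struct pid_namespace, ns);
}

int main(void)
{
    struct pid_namespace pidns = { .level = 1, .ns = { .inum = 0xf00d } };
    struct ns_common *common = &pidns.ns;   /* what pidns_get() would hand out */

    printf("level=%d inum=%#x\n", to_pid_ns(common)->level, common->inum);
    return 0;
}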
&ns->ns : NULL; } -static void pidns_put(void *ns) +static void pidns_put(struct ns_common *ns) { - put_pid_ns(ns); + put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, void *ns) +static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = ns; + struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int pidns_inum(void *ns) -{ - struct pid_namespace *pid_ns = ns; - return pid_ns->proc_inum; -} - const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, - .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e4e4121fa327..48b28d387c7f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS + select PM config PM_SLEEP_SMP def_bool y @@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on !IA64_HP_SIM +config PM + bool "Device power management core functionality" ---help--- Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated + (low power) states, for example after a specified period of inactivity + (autosuspended), and woken up in response to a hardware-generated wake-up event or a driver's request. Hardware support is generally required for this functionality to work and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and + responsible for the actual handling of device suspend requests and wake-up events. -config PM - def_bool y - depends on PM_SLEEP || PM_RUNTIME - config PM_DEBUG bool "Power Management Debug Support" depends on PM @@ -298,10 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS -config PM_GENERIC_DOMAINS_RUNTIME +config PM_GENERIC_DOMAINS_OF def_bool y - depends on PM_RUNTIME && PM_GENERIC_DOMAINS + depends on PM_GENERIC_DOMAINS && OF config CPU_PM bool - depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..2329daae5255 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/ktime.h> #include <trace/events/power.h> #include "power.h" @@ -232,20 +233,17 @@ static void platform_recover(int platform_mode) * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. 
*/ -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) +void swsusp_show_speed(ktime_t start, ktime_t stop, + unsigned nr_pages, char *msg) { + ktime_t diff; u64 elapsed_centisecs64; unsigned int centisecs; unsigned int k; unsigned int kps; - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - /* - * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, - * it is obvious enough for what went wrong. - */ - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + diff = ktime_sub(stop, start); + elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ @@ -502,8 +500,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 2df883a9d3cb..ce9b8328a689 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain); struct timeval; /* kernel/power/swsusp.c */ -extern void swsusp_show_speed(struct timeval *, struct timeval *, - unsigned int, char *); +extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + bool ret; + + read_lock(&tasklist_lock); + ret = __check_frozen_processes(); + read_unlock(&tasklist_lock); + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. 
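swsusp_show_speed() switches from struct timeval to ktime_t, so the elapsed time is a monotonic subtraction followed by a division down to centiseconds, which also removes the old negative-interval corner case. A userspace sketch of the same computation built on clock_gettime(CLOCK_MONOTONIC); the helper names and the 4 KiB page-size assumption are mine:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC  1000000000LL
#define NSEC_PER_MSEC 1000000LL

static int64_t now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);        /* monotonic, like ktime_get() */
    return (int64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/* Report throughput over nr_pages, assuming 4 KiB pages, in the same
 * shape as swsusp_show_speed() after the ktime_t conversion. */
static void show_speed(int64_t start, int64_t stop, unsigned nr_pages,
                       const char *msg)
{
    int64_t centisecs = (stop - start) / (10 * NSEC_PER_MSEC);
    unsigned k, kps;

    if (centisecs == 0)
        centisecs = 1;                          /* avoid div-by-zero */
    k = nr_pages * 4;                           /* KiB, assuming 4 KiB pages */
    kps = (unsigned)(k * 100 / centisecs);
    printf("%s %u kbytes in %lld.%02lld seconds (%u.%02u MB/s)\n",
           msg, k, (long long)(centisecs / 100), (long long)(centisecs % 100),
           kps / 1000, (kps % 1000) / 10);
}

int main(void)
{
    int64_t start = now_ns();
    /* ... work being measured ... */
    int64_t stop = now_ns();

    show_speed(start, stop, 1024, "Wrote");
    return 0;
}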
The same process that calls @@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -129,13 +154,28 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); + pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + } else { + printk("done."); + } } printk("\n"); BUG_ON(in_atomic()); @@ -190,11 +230,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -217,10 +257,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f1604d8cf489..0c40c16174b4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/slab.h> 
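The new memory_bandwidth PM QoS class aggregates requests with PM_QOS_SUM: unlike the MIN/MAX classes, the effective constraint is the sum of all outstanding requests. A userspace sketch of the three aggregation modes over a plain array (the enum and values are illustrative):

#include <stdio.h>

enum qos_type { QOS_MIN, QOS_MAX, QOS_SUM };

/* Aggregate outstanding requests the way pm_qos_get_value() does after this
 * patch: MIN and MAX pick an extreme, SUM adds all requests together. */
static int qos_target(enum qos_type type, const int *req, int n, int none)
{
    int i, v;

    if (n == 0)
        return none;                 /* no constraints registered */

    v = (type == QOS_SUM) ? 0 : req[0];
    for (i = 0; i < n; i++) {
        switch (type) {
        case QOS_MIN: if (req[i] < v) v = req[i]; break;
        case QOS_MAX: if (req[i] > v) v = req[i]; break;
        case QOS_SUM: v += req[i];                break;
        }
    }
    return v;
}

int main(void)
{
    int bw[] = { 100, 250, 50 };     /* e.g. bandwidth requests from three devices */

    printf("sum=%d max=%d min=%d\n",
           qos_target(QOS_SUM, bw, 3, 0),
           qos_target(QOS_MAX, bw, 3, 0),
           qos_target(QOS_MIN, bw, 3, 0));
    return 0;
}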
#include <linux/compiler.h> +#include <linux/ktime.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -725,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -1333,23 +1342,39 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + if (!forbidden_pages_map || !free_pages_map) + goto out; + + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + +out: nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; @@ -1552,11 +1577,11 @@ int hibernate_preallocate_memory(void) struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; unsigned long alloc, save_highmem, pages_highmem, avail_normal; - struct timeval start, stop; + ktime_t start, stop; int error; printk(KERN_INFO "PM: Preallocating image memory... "); - do_gettimeofday(&start); + start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); if (error) @@ -1685,9 +1710,9 @@ int hibernate_preallocate_memory(void) free_unnecessary_pages(); out: - do_gettimeofday(&stop); + stop = ktime_get(); printk(KERN_CONT "done (allocated %lu pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Allocated"); + swsusp_show_speed(start, stop, pages, "Allocated"); return 0; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 18c62195660f..c347e3ce3a55 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state) static int platform_suspend_prepare_late(suspend_state_t state) { + return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? + freeze_ops->prepare() : 0; +} + +static int platform_suspend_prepare_noirq(suspend_state_t state) +{ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 
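The new swsusp_free() no longer scans every pfn of every zone; it walks the forbidden and free bitmaps in parallel, advancing whichever iterator is behind until both report the same pfn, and frees exactly the pages set in both maps. The same parallel-advance intersection over two sorted sequences, in userspace form (an END sentinel stands in for BM_END_OF_MAP):

#include <limits.h>
#include <stdio.h>

#define END ULONG_MAX                 /* plays the role of BM_END_OF_MAP */

struct iter { const unsigned long *v; int i, n; };

static unsigned long next(struct iter *it)
{
    return it->i < it->n ? it->v[it->i++] : END;
}

/* Visit every value present in BOTH sorted sequences, advancing whichever
 * side is behind; terminates when both iterators reach END. */
static void intersect(struct iter *a, struct iter *b)
{
    unsigned long va = next(a), vb = next(b);

    while (va != END || vb != END) {
        while (va != vb) {            /* catch the slower side up */
            if (va < vb)
                va = next(a);
            else
                vb = next(b);
        }
        if (va == END)
            break;
        printf("common value: %lu\n", va);   /* "free this page" */
        va = next(a);
        vb = next(b);
    }
}

int main(void)
{
    const unsigned long f[] = { 3, 7, 9, 12 };   /* "forbidden" pfns */
    const unsigned long r[] = { 1, 7, 12, 20 };  /* "free" pfns */
    struct iter a = { f, 0, 4 }, b = { r, 0, 4 };

    intersect(&a, &b);    /* prints 7 and 12 */
    return 0;
}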
suspend_ops->prepare_late() : 0; } -static void platform_suspend_wake(suspend_state_t state) +static void platform_resume_noirq(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) suspend_ops->wake(); } -static void platform_suspend_finish(suspend_state_t state) +static void platform_resume_early(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) + freeze_ops->restore(); +} + +static void platform_resume_finish(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) suspend_ops->finish(); @@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state) return 0; } -static void platform_suspend_end(suspend_state_t state) +static void platform_resume_end(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); @@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state) suspend_ops->end(); } -static void platform_suspend_recover(suspend_state_t state) +static void platform_recover(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) suspend_ops->recover(); @@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Platform_finish; - error = dpm_suspend_end(PMSG_SUSPEND); + error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); + printk(KERN_ERR "PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); if (error) + goto Devices_early_resume; + + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + goto Platform_early_resume; + } + error = platform_suspend_prepare_noirq(state); + if (error) goto Platform_wake; if (suspend_test(TEST_PLATFORM)) @@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - platform_suspend_wake(state); - dpm_resume_start(PMSG_RESUME); + platform_resume_noirq(state); + dpm_resume_noirq(PMSG_RESUME); + + Platform_early_resume: + platform_resume_early(state); + + Devices_early_resume: + dpm_resume_early(PMSG_RESUME); Platform_finish: - platform_suspend_finish(state); + platform_resume_finish(state); return error; } @@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + trace_suspend_resume(TPS("resume_console"), state, true); resume_console(); + trace_suspend_resume(TPS("resume_console"), state, false); Close: - platform_suspend_end(state); + platform_resume_end(state); return error; Recover_platform: - platform_suspend_recover(state); + platform_recover(state); goto Resume_devices; } diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bd91bc177c93..084452e34a12 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -22,6 +22,8 @@ #define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; +static u32 test_repeat_count_max = 1; +static u32 test_repeat_count_current; void suspend_test_start(void) { @@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) int status; /* this may fail if the RTC hasn't been initialized */ +repeat: status = rtc_read_time(rtc, &alm.time); if (status < 0) { printk(err_readtime, dev_name(&rtc->dev), status); @@ -100,10 +103,21 @@ static void __init 
test_wakealarm(struct rtc_device *rtc, suspend_state_t state) if (state == PM_SUSPEND_STANDBY) { printk(info_test, pm_states[state]); status = pm_suspend(state); + if (status < 0) + state = PM_SUSPEND_FREEZE; } + if (state == PM_SUSPEND_FREEZE) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) printk(err_suspend, status); + test_repeat_count_current++; + if (test_repeat_count_current < test_repeat_count_max) + goto repeat; + /* Some platforms can't detect that the alarm triggered the * wakeup, or (accordingly) disable it after it afterwards. * It's supposed to give oneshot behavior; cope. @@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata = static int __init setup_test_suspend(char *value) { int i; + char *repeat; + char *suspend_type; - /* "=mem" ==> "mem" */ + /* example : "=mem[,N]" ==> "mem[,N]" */ value++; + suspend_type = strsep(&value, ","); + if (!suspend_type) + return 0; + + repeat = strsep(&value, ","); + if (repeat) { + if (kstrtou32(repeat, 0, &test_repeat_count_max)) + return 0; + } + for (i = 0; pm_labels[i]; i++) - if (!strcmp(pm_labels[i], value)) { + if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; return 0; } - printk(warn_bad_state, value); + printk(warn_bad_state, suspend_type); return 0; } __setup("test_suspend", setup_test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aaa3261dea5d..570aff817543 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -30,6 +30,7 @@ #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> +#include <linux/ktime.h> #include "power.h" @@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); @@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) @@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } @@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; size_t off; unsigned thr, run_threads, nr_threads; unsigned char *page = NULL; @@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { @@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle, out_finish: err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: if (crc) { if (crc->thr) @@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle, { unsigned int 
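setup_test_suspend() now accepts "test_suspend=<state>[,N]" and uses strsep() to split the state name from an optional repeat count parsed with kstrtou32(). A userspace equivalent of that parsing, with strtoul standing in for kstrtou32:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "<state>[,N]" as setup_test_suspend() does, e.g. "mem,3".
 * Returns 0 on success and fills *state / *repeat. */
static int parse_test_suspend(char *value, char **state, unsigned long *repeat)
{
    char *type, *count;

    *repeat = 1;                          /* default: run the test once */

    type = strsep(&value, ",");           /* consumes up to the first ',' */
    if (!type || !*type)
        return -1;
    *state = type;

    count = strsep(&value, ",");          /* optional repeat count */
    if (count) {
        char *end;

        *repeat = strtoul(count, &end, 0);
        if (*end != '\0')
            return -1;                    /* trailing junk */
    }
    return 0;
}

int main(void)
{
    char arg[] = "mem,3";
    char *state;
    unsigned long repeat;

    if (!parse_test_suspend(arg, &state, &repeat))
        printf("state=%s repeat=%lu\n", state, repeat);
    return 0;
}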
m; int ret = 0; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; struct bio *bio; int err2; unsigned nr_pages; @@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) { @@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle, if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } @@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle, int ret = 0; int eof = 0; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; unsigned nr_pages; size_t off; unsigned i, thr, run_threads, nr_threads; @@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -1343,7 +1344,7 @@ out_finish: wait_event(crc->done, atomic_read(&crc->stop)); atomic_set(&crc->stop, 0); } - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) { printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); @@ -1359,7 +1360,7 @@ out_finish: } } } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); @@ -1374,7 +1375,7 @@ out_clean: kthread_stop(data[thr].thr); vfree(data); } - if (page) vfree(page); + vfree(page); return ret; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1ce770687ea8..02d6b6d28796 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -62,9 +62,6 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; -/* Deferred messaged from sched code are marked by this special level */ -#define SCHED_MESSAGE_LOGLEVEL -2 - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. 
@@ -267,7 +264,6 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -481,7 +477,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -static int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, bool from_file) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -519,14 +515,13 @@ struct devkmsg_user { char buf[8192]; }; -static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iov_length(iv, count); + size_t len = iocb->ki_nbytes; ssize_t ret = len; if (len > LOG_LINE_MAX) @@ -535,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, if (buf == NULL) return -ENOMEM; - line = buf; - for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { - ret = -EFAULT; - goto out; - } - line += iv[i].iov_len; + buf[len] = '\0'; + if (copy_from_iter(buf, len, from) != len) { + kfree(buf); + return -EFAULT; } /* @@ -567,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = endp; } } - line[len] = '\0'; printk_emit(facility, level, NULL, 0, "%s", line); -out: kfree(buf); return ret; } @@ -802,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file) const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, - .aio_write = devkmsg_writev, + .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, @@ -858,6 +848,9 @@ static int __init log_buf_len_setup(char *str) } early_param("log_buf_len", log_buf_len_setup); +#ifdef CONFIG_SMP +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) + static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; @@ -884,6 +877,9 @@ static void __init log_buf_add_cpu(void) log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } +#else /* !CONFIG_SMP */ +static inline void log_buf_add_cpu(void) {} +#endif /* CONFIG_SMP */ void __init setup_log_buf(int early) { @@ -1260,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; - static int saved_console_loglevel = -1; + static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, from_file); @@ -1317,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) + if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { + if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ @@ 
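log_buf_add_cpu() and __LOG_CPU_MAX_BUF_LEN only make sense on SMP, so they move under #ifdef CONFIG_SMP with an empty static inline stub for UP builds, letting the caller invoke the function unconditionally. The idiom in isolation (FEATURE_SMP is a stand-in for the real config symbol):

#include <stdio.h>

#ifdef FEATURE_SMP
/* Real implementation, compiled only when the feature is enabled. */
static void log_buf_add_cpu(void)
{
    printf("enlarging log buffer for secondary CPUs\n");
}
#else
/* Empty inline stub: callers need no #ifdefs and the compiler drops it. */
static inline void log_buf_add_cpu(void) {}
#endif

int main(void)
{
    log_buf_add_cpu();      /* always safe to call, regardless of config */
    return 0;
}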
-1337,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; error = 0; break; /* Number of chars in the log buffer */ @@ -1628,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, int printed_len = 0; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ - static volatile unsigned int logbuf_cpu = UINT_MAX; + static unsigned int logbuf_cpu = UINT_MAX; - if (level == SCHED_MESSAGE_LOGLEVEL) { - level = -1; + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; in_sched = true; } @@ -1680,12 +1676,7 @@ asmlinkage int vprintk_emit(int facility, int level, * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - if (in_sched) - text_len = scnprintf(text, sizeof(textbuf), - KERN_WARNING "[sched_delayed] "); - - text_len += vscnprintf(text + text_len, - sizeof(textbuf) - text_len, fmt, args); + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1701,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *end_of_header = printk_skip_level(text); switch (kern_level) { case '0' ... '7': - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; + /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } @@ -1716,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, } } - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dict) @@ -1794,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, -1, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1813,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); +int vprintk_default(const char *fmt, va_list args) +{ + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + r = vkdb_printf(fmt, args); + return r; + } +#endif + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + + return r; +} +EXPORT_SYMBOL_GPL(vprintk_default); + +/* + * This allows printk to be diverted to another function per cpu. + * This is useful for calling printk functions from within NMI + * without worrying about race conditions that can lock up the + * box. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; + /** * printk - print a kernel message * @fmt: format string @@ -1836,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit); */ asmlinkage __visible int printk(const char *fmt, ...) { + printk_func_t vprintk_func; va_list args; int r; -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif va_start(args, fmt); - r = vprintk_emit(0, -1, NULL, 0, fmt, args); + + /* + * If a caller overrides the per_cpu printk_func, then it needs + * to disable preemption when calling printk(). Otherwise + * the printk_func should be set to the default. No need to + * disable preemption here. 
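printk() now dispatches through a per-CPU function pointer, printk_func, which normally points at vprintk_default(); contexts where the regular path could deadlock (the patch mentions NMI) can temporarily install a different writer. A userspace analogue using a thread-local function pointer; the names are invented and vprintf stands in for the real emit path:

#include <stdarg.h>
#include <stdio.h>

typedef int (*emit_fn)(const char *fmt, va_list args);

static int emit_default(const char *fmt, va_list args)
{
    return vprintf(fmt, args);                  /* the normal, full path */
}

static int emit_to_buffer(const char *fmt, va_list args)
{
    static char buf[256];                       /* "safe" deferred path */

    return vsnprintf(buf, sizeof(buf), fmt, args);
}

/* Per-thread here; per-CPU in the kernel patch. */
static _Thread_local emit_fn emit = emit_default;

static int logmsg(const char *fmt, ...)
{
    va_list args;
    int r;

    va_start(args, fmt);
    r = emit(fmt, args);                        /* indirect call, swappable */
    va_end(args);
    return r;
}

int main(void)
{
    logmsg("normal path: %d\n", 1);

    emit = emit_to_buffer;                      /* divert, e.g. in a handler */
    logmsg("diverted: %d\n", 2);
    emit = emit_default;                        /* restore */

    logmsg("normal again: %d\n", 3);
    return 0;
}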
+ */ + vprintk_func = this_cpu_read(printk_func); + r = vprintk_func(fmt, args); + va_end(args); return r; @@ -1882,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +/* Still needs to be defined for users */ +DEFINE_PER_CPU(printk_func_t, printk_func); + #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; + char buf[512]; + int n; + + if (!early_console) + return; va_start(ap, fmt); - early_vprintk(fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); + + early_console->write(early_console, buf, n); } #endif @@ -2628,7 +2646,7 @@ void wake_up_klogd(void) preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } @@ -2640,11 +2658,11 @@ int printk_deferred(const char *fmt, ...) preempt_disable(); va_start(args, fmt); - r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); return r; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..1eb9d90c3af9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. 
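exit_ptrace() no longer drops and retakes tasklist_lock itself; it only moves detached tracees onto a caller-supplied dead list while the lock is held, and the caller releases them after unlocking. The general collect-under-the-lock, destroy-outside-it shape, sketched with pthreads (types and helpers are illustrative, not the actual ptrace call structure):

#include <pthread.h>
#include <stdlib.h>

struct node {
    struct node *next;
    /* ... payload ... */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *tracked;              /* protected by list_lock */

/* Phase 1: under the lock, only unlink and collect; no freeing here,
 * because destruction may sleep or need other locks. */
static struct node *collect_all(void)
{
    struct node *dead;

    pthread_mutex_lock(&list_lock);
    dead = tracked;
    tracked = NULL;
    pthread_mutex_unlock(&list_lock);
    return dead;
}

/* Phase 2: outside the lock, actually release everything. */
static void release_all(struct node *dead)
{
    while (dead) {
        struct node *next = dead->next;

        free(dead);
        dead = next;
    }
}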
*/ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); + list_add(&p->ptrace_entry, dead); } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..e6fae503d1bc 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,6 +1,6 @@ obj-y += update.o srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ff1a6de62f17..07bb02eda844 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +void rcu_early_boot_tests(void); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..4d559baf06e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -49,11 +49,19 @@ #include <linux/trace_clock.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include <linux/vmalloc.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +torture_param(int, cbflood_inter_holdoff, HZ, + "Holdoff between floods (jiffies)"); +torture_param(int, cbflood_intra_holdoff, 1, + "Holdoff between bursts (jiffies)"); +torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); +torture_param(int, cbflood_n_per_burst, 20000, + "# callbacks per burst in flood"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); static int nrealreaders; +static int ncbflooders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; @@ -138,6 +148,7 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; +static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; static int rcu_torture_writer_state; @@ -157,9 +168,9 @@ static int rcu_torture_writer_state; #else #define RCUTORTURE_RUNNABLE_INIT 0 #endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); +static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 @@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void) #endif /* #else #ifdef CONFIG_RCU_TRACE */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ +static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ static bool barrier_phase; /* Test phase. */ @@ -242,7 +253,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - void (*stats)(char *page); + void (*stats)(void); int irq_capable; int can_boost; const char *name; @@ -525,21 +536,21 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static void srcu_torture_stats(char *page) +static void srcu_torture_stats(void) { int cpu; int idx = srcu_ctl.completed & 0x1; - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); + pr_alert("%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; - page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } - sprintf(page, "\n"); + pr_cont("\n"); } static void srcu_torture_synchronize_expedited(void) @@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks torture testing. 
+ */ + +static int tasks_torture_read_lock(void) +{ + return 0; +} + +static void tasks_torture_read_unlock(int idx) +{ +} + +static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_ops = { + .ttype = RCU_TASKS_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = tasks_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_tasks_torture_deferred_free, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .call = call_rcu_tasks, + .cb_barrier = rcu_barrier_tasks, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks" +}; + +#define RCUTORTURE_TASKS_OPS &tasks_ops, + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUTORTURE_TASKS_OPS + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -707,6 +764,59 @@ checkwait: stutter_wait("rcu_torture_boost"); return 0; } +static void rcu_torture_cbflood_cb(struct rcu_head *rhp) +{ +} + +/* + * RCU torture callback-flood kthread. Repeatedly induces bursts of calls + * to call_rcu() or analogous, increasing the probability of occurrence + * of callback-overflow corner cases. + */ +static int +rcu_torture_cbflood(void *arg) +{ + int err = 1; + int i; + int j; + struct rcu_head *rhp; + + if (cbflood_n_per_burst > 0 && + cbflood_inter_holdoff > 0 && + cbflood_intra_holdoff > 0 && + cur_ops->call && + cur_ops->cb_barrier) { + rhp = vmalloc(sizeof(*rhp) * + cbflood_n_burst * cbflood_n_per_burst); + err = !rhp; + } + if (err) { + VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); + while (!torture_must_stop()) + schedule_timeout_interruptible(HZ); + return 0; + } + VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); + do { + schedule_timeout_interruptible(cbflood_inter_holdoff); + atomic_long_inc(&n_cbfloods); + WARN_ON(signal_pending(current)); + for (i = 0; i < cbflood_n_burst; i++) { + for (j = 0; j < cbflood_n_per_burst; j++) { + cur_ops->call(&rhp[i * cbflood_n_per_burst + j], + rcu_torture_cbflood_cb); + } + schedule_timeout_interruptible(cbflood_intra_holdoff); + WARN_ON(signal_pending(current)); + } + cur_ops->cb_barrier(); + stutter_wait("rcu_torture_cbflood"); + } while (!torture_must_stop()); + vfree(rhp); + torture_kthread_stopping("rcu_torture_cbflood"); + return 0; +} + /* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1019,7 +1129,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1031,10 +1141,15 @@ rcu_torture_reader(void *arg) } /* - * Create an RCU-torture statistics message in the specified buffer. + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! 
This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). */ static void -rcu_torture_printk(char *page) +rcu_torture_stats_print(void) { int cpu; int i; @@ -1052,55 +1167,61 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page = torture_onoff_stats(page); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + pr_cont("rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + torture_onoff_stats(); + pr_cont("barrier: %ld/%ld:%ld ", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - page += sprintf(page, "!!! "); + pr_cont("%s", "!!! 
"); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - page += sprintf(page, "Reader Pipe: "); + pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); + pr_cont(" %ld", pipesummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); + pr_cont(" %ld", batchsummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); + pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); } - page += sprintf(page, "\n"); + pr_cont("\n"); + if (cur_ops->stats) - cur_ops->stats(page); + cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags; @@ -1109,10 +1230,9 @@ rcu_torture_printk(char *page) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - page += sprintf(page, - "??? Writer stall state %d g%lu c%lu f%#x\n", - rcu_torture_writer_state, - gpnum, completed, flags); + pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); show_rcu_gp_kthreads(); rcutorture_trace_dump(); } @@ -1120,30 +1240,6 @@ rcu_torture_printk(char *page) } /* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - -/* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. 
*/ @@ -1295,7 +1391,8 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - cur_ops->cb_barrier(); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; @@ -1418,7 +1515,7 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - if (torture_cleanup()) { + if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; @@ -1447,6 +1544,8 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); + for (i = 0; i < ncbflooders; i++) + torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); @@ -1468,6 +1567,7 @@ rcu_torture_cleanup(void) "End of test: RCU_HOTPLUG"); else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -1534,9 +1634,10 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, + RCUTORTURE_TASKS_OPS }; - if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ @@ -1693,6 +1794,24 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); + if (cbflood_n_burst > 0) { + /* Create the cbflood threads */ + ncbflooders = (num_online_cpus() + 3) / 4; + cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), + GFP_KERNEL); + if (!cbflood_task) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < ncbflooders; i++) { + firsterr = torture_create_kthread(rcu_torture_cbflood, + NULL, + cbflood_task[i]); + if (firsterr) + goto unwind; + } + } rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..0db5649f8817 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ static void rcu_idle_enter_common(long long newval) { if (newval) { @@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval) } RCU_TRACE(trace_rcu_dyntick(TPS("Start"), rcu_dynticks_nesting, newval)); - if (!is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), @@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + rcu_sched_qs(); /* implies rcu_bh_inc() */ barrier(); rcu_dynticks_nesting = newval; } @@ -114,7 +114,7 @@ void rcu_irq_exit(void) } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. 
*/ +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ static void rcu_idle_exit_common(long long oldval) { if (oldval) { @@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), @@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { unsigned long flags; @@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu) /* * Record an rcu_bh quiescent state. */ -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { unsigned long flags; @@ -247,13 +247,15 @@ void rcu_bh_qs(int cpu) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); + rcu_sched_qs(); else if (!in_softirq()) - rcu_bh_qs(cpu); + rcu_bh_qs(); + if (user) + rcu_note_voluntary_context_switch(current); } /* @@ -378,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_init(void) +void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + rcu_early_boot_tests(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..7680fc275036 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; * the tracing userspace tools to be able to decipher the string * address to the matching string. */ -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +#ifdef CONFIG_TRACING +# define DEFINE_RCU_TPS(sname) \ static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; +# define RCU_STATE_NAME(sname) sname##_varname +#else +# define DEFINE_RCU_TPS(sname) +# define RCU_STATE_NAME(sname) __stringify(sname) +#endif + +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +DEFINE_RCU_TPS(sname) \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ @@ -93,10 +102,10 @@ struct rcu_state sname##_state = { \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = sname##_varname, \ + .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ -DEFINE_PER_CPU(struct rcu_data, sname##_data) +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -143,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. 
- */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -188,22 +184,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * one since the start of the grace period, this just sets a flag. * The caller must have disabled preemption. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + } } -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_bh"), + __this_cpu_read(rcu_bh_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + } } static DEFINE_PER_CPU(int, rcu_sched_qs_mask); @@ -275,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled preemption. */ -void rcu_note_context_switch(int cpu) +void rcu_note_context_switch(void) { trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); + rcu_sched_qs(); + rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); @@ -314,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); -static int rcu_pending(int cpu); +static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. @@ -499,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, - bool user) +static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { @@ -520,12 +518,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } - rcu_prepare_for_idle(smp_processor_id()); + rcu_prepare_for_idle(); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. 
*/ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_task_enter(); /* * It is illegal to enter an extended quiescent state while @@ -553,7 +552,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(rdtp, oldval, user); + rcu_eqs_enter_common(oldval, user); } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } @@ -577,7 +576,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -627,8 +626,8 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, true); - rcu_sysidle_enter(rdtp, 1); + rcu_eqs_enter_common(oldval, true); + rcu_sysidle_enter(1); local_irq_restore(flags); } @@ -639,15 +638,17 @@ void rcu_irq_exit(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, - int user) +static void rcu_eqs_exit_common(long long oldval, int user) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -678,7 +679,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(oldval, user); } } @@ -699,7 +700,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -750,8 +751,8 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, true); - rcu_sysidle_exit(rdtp, 1); + rcu_eqs_exit_common(oldval, true); + rcu_sysidle_exit(1); local_irq_restore(flags); } @@ -819,7 +820,7 @@ bool notrace __rcu_is_watching(void) */ bool notrace rcu_is_watching(void) { - int ret; + bool ret; preempt_disable(); ret = __rcu_is_watching(); @@ -1647,7 +1648,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } mutex_unlock(&rsp->onoff_mutex); @@ -1668,7 +1669,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { - isidle = 1; + isidle = true; maxj = jiffies - ULONG_MAX / 4; } force_qs_rnp(rsp, dyntick_save_progress_counter, @@ -1677,14 +1678,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. 
*/ - isidle = 0; + isidle = false; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1736,7 +1738,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1785,8 +1787,8 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched(); - flush_signals(current); + cond_resched_rcu_qs(); + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwaitsig")); @@ -1828,11 +1830,11 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched(); + cond_resched_rcu_qs(); } else { /* Deal with stray signal. */ - cond_resched(); - flush_signals(current); + cond_resched_rcu_qs(); + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswaitsig")); @@ -1928,7 +1930,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2210,8 +2212,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ - /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); raw_spin_lock_irqsave(&rsp->orphan_lock, flags); @@ -2375,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); @@ -2393,8 +2393,8 @@ void rcu_check_callbacks(int cpu, int user) * at least not while the corresponding CPU is online. */ - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); + rcu_sched_qs(); + rcu_bh_qs(); } else if (!in_softirq()) { @@ -2405,11 +2405,13 @@ void rcu_check_callbacks(int cpu, int user) * critical section, so note it. 
*/ - rcu_bh_qs(cpu); + rcu_bh_qs(); } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) + rcu_preempt_check_callbacks(); + if (rcu_pending()) invoke_rcu_core(); + if (user) + rcu_note_voluntary_context_switch(current); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -2432,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); + cond_resched_rcu_qs(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -2449,7 +2451,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { if ((rnp->qsmaskinit & bit) != 0) - *isidle = 0; + *isidle = false; if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } @@ -2505,9 +2507,10 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2925,11 +2928,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. - * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and * sync_sched_expedited_done taking on the roles of the halves @@ -2953,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { + cpumask_var_t cm; + bool cma = false; + int cpu; long firstsnap, s, snap; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; @@ -2979,14 +2980,34 @@ void synchronize_sched_expedited(void) */ snap = atomic_long_inc_return(&rsp->expedited_start); firstsnap = snap; - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, fall back to normal GP. */ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + return; + } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ + cma = zalloc_cpumask_var(&cm, GFP_KERNEL); + if (cma) { + cpumask_copy(cm, cpu_online_mask); + cpumask_clear_cpu(raw_smp_processor_id(), cm); + for_each_cpu(cpu, cm) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + cpumask_clear_cpu(cpu, cm); + } + if (cpumask_weight(cm) == 0) + goto all_cpus_idle; + } + /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cpu_online_mask, + while (try_stop_cpus(cma ? 
cm : cpu_online_mask, synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); @@ -2998,6 +3019,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); + free_cpumask_var(cm); return; } @@ -3007,6 +3029,7 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } @@ -3016,6 +3039,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); + free_cpumask_var(cm); return; } @@ -3026,12 +3050,21 @@ void synchronize_sched_expedited(void) * and they started after our first try, so their grace * period works for us. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, use normal GP. */ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); + return; + } snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } atomic_long_inc(&rsp->expedited_stoppedcpus); +all_cpus_idle: + free_cpumask_var(cm); + /* * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still @@ -3123,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -static int rcu_pending(int cpu) +static int rcu_pending(void) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) return 1; return 0; } @@ -3138,7 +3171,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) 
*/ -static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3146,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!rdp->nxtlist) continue; hc = true; @@ -3279,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); @@ -3442,6 +3480,7 @@ static int rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_prepare_cpu(cpu); rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); break; case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -3459,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self, case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) + for_each_rcu_flavor(rsp) { rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } break; default: break; @@ -3489,7 +3530,7 @@ static int rcu_pm_notify(struct notifier_block *self, } /* - * Spawn the kthread that handles this RCU flavor's grace periods. + * Spawn the kthreads that handle each RCU flavor's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { @@ -3498,6 +3539,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_state *rsp; struct task_struct *t; + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); @@ -3505,8 +3547,9 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_spawn_nocb_kthreads(rsp); } + rcu_spawn_nocb_kthreads(); + rcu_spawn_boost_kthreads(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -3738,6 +3781,8 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + + rcu_early_boot_tests(); } #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..8e7b1843896e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -139,7 +139,7 @@ struct rcu_node { unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ - /* complete (only for TREE_PREEMPT_RCU). */ + /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -350,7 +350,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. 
*/ struct task_struct *nocb_kthread; - bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ /* The following fields are used by the leader, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -383,6 +383,11 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +/* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOGP_WAKE_NOT 0 +#define RCU_NOGP_WAKE 1 +#define RCU_NOGP_WAKE_FORCE 2 + #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ /* and jiffies_till_next_fqs. */ @@ -525,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -542,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); +static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -556,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -572,15 +577,17 @@ static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ +static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); +static void rcu_cleanup_after_idle(void); +static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct 
rcu_node *rnp); @@ -589,14 +596,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags); -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_spawn_all_nocb_kthreads(int cpu); +static void __init rcu_spawn_nocb_kthreads(void); +#ifdef CONFIG_RCU_NOCB_CPU +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_enter(int irq); +static void rcu_sysidle_exit(int irq); static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); @@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static void rcu_dynticks_task_enter(void); +static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..3ec85cb5d544 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -30,14 +30,24 @@ #include <linux/smpboot.h> #include "../time/tick-internal.h" -#define RCU_KTHREAD_PRIO 1 - #ifdef CONFIG_RCU_BOOST + #include "../locking/rtmutex_common.h" -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else -#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO -#endif + +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. 
*/ @@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE pr_info("\tRCU torture testing starts during boot.\n"); #endif -#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); -#endif #if defined(CONFIG_RCU_CPU_STALL_INFO) pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif @@ -85,36 +92,12 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_NOCB_CPU -#ifndef CONFIG_RCU_NOCB_CPU_NONE - if (!have_rcu_nocb_mask) { - zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); - have_rcu_nocb_mask = true; - } -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - if (have_rcu_nocb_mask) { - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - } -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#endif } -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; @@ -134,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -long rcu_batches_completed_preempt(void) +static long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -155,18 +138,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - */ -static void rcu_preempt_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + * As with the other rcu_*_qs() functions, callers to this function + * must disable preemption. + */ +static void rcu_preempt_qs(void) +{ + if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_preempt"), + __this_cpu_read(rcu_preempt_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + current->rcu_read_unlock_special.b.need_qs = false; + } } /* @@ -182,7 +166,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. 
*/ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -190,14 +174,14 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_node *rnp; if (t->rcu_read_lock_nesting > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; /* @@ -239,7 +223,7 @@ static void rcu_preempt_note_context_switch(int cpu) : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special) { + t->rcu_read_unlock_special.s) { /* * Complete exit from RCU read-side critical section on @@ -257,9 +241,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - local_irq_save(flags); - rcu_preempt_qs(cpu); - local_irq_restore(flags); + rcu_preempt_qs(); } /* @@ -340,7 +322,7 @@ void rcu_read_unlock_special(struct task_struct *t) bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; - int special; + union rcu_special special; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -350,12 +332,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. + * let it know that we have done so. Because irqs are disabled, + * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) { - rcu_preempt_qs(smp_processor_id()); - if (!t->rcu_read_unlock_special) { + if (special.b.need_qs) { + rcu_preempt_qs(); + if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } @@ -368,8 +351,8 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + if (special.b.blocked) { + t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The @@ -442,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t) } } -#ifdef CONFIG_RCU_CPU_STALL_VERBOSE - /* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. @@ -478,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - #ifdef CONFIG_RCU_CPU_STALL_INFO static void rcu_print_task_stall_begin(struct rcu_node *rnp) @@ -648,17 +621,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * * Caller must disable hard irqs. 
*/ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(cpu); + rcu_preempt_qs(); return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; + __this_cpu_read(rcu_preempt_data.qs_pending) && + !__this_cpu_read(rcu_preempt_data.passed_quiesce)) + t->rcu_read_unlock_special.b.need_qs = true; } #ifdef CONFIG_RCU_BOOST @@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) * In fact, if you are using synchronize_rcu_expedited() in a loop, * please restructure your code to batch your updates, and then Use a * single synchronize_rcu() instead. - * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. */ void synchronize_rcu_expedited(void) { @@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void) * being boosted. This simplifies the process of moving tasks * from leaf to root rcu_node structures. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU-hotplug operation in flight, fall back to normal GP. */ + wait_rcu_gp(call_rcu); + return; + } /* * Acquire lock, falling back to synchronize_rcu() if too many @@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ smp_mb(); /* ensure expedited GP seen before counter increment. */ - ACCESS_ONCE(sync_rcu_preempt_exp_count)++; + ACCESS_ONCE(sync_rcu_preempt_exp_count) = + sync_rcu_preempt_exp_count + 1; unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: @@ -941,11 +915,11 @@ void exit_rcu(void) return; t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); } -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *rcu_state_p = &rcu_sched_state; @@ -971,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { } @@ -1043,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * Because preemptible RCU does not exist, it never has any callbacks * to check. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { } @@ -1096,7 +1070,7 @@ void exit_rcu(void) { } -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1352,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_BOOST_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ return 0; @@ -1369,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } @@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = { }; /* - * Spawn all kthreads -- called as soon as the scheduler is running. + * Spawn boost kthreads -- called as soon as the scheduler is running. */ -static int __init rcu_spawn_kthreads(void) +static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; int cpu; - rcu_scheduler_fully_active = 1; for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); @@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void) rcu_for_each_leaf_node(rcu_state_p, rnp) (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } - return 0; } -early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { @@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static int __init rcu_scheduler_really_started(void) +static void __init rcu_spawn_boost_kthreads(void) { - rcu_scheduler_fully_active = 1; - return 0; } -early_initcall(rcu_scheduler_really_started); static void rcu_prepare_kthreads(int cpu) { @@ -1544,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu, NULL); + return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1555,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up * after it. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { } @@ -1563,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu) * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { } @@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* Exit early if we advanced recently. */ if (jiffies == rdtp->last_advance_all) - return 0; + return false; rdtp->last_advance_all = jiffies; for_each_rcu_flavor(rsp) { @@ -1656,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *dj) +int rcu_needs_cpu(unsigned long *dj) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { *dj = ULONG_MAX; return 0; } @@ -1698,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) * * The caller must have disabled interrupts. 
*/ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; int tne; @@ -1711,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu) /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu, NULL)) + if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1720,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* @@ -1744,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu) return; rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; @@ -1763,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu) * any grace periods that elapsed while the CPU was idle, and if any * callbacks are now ready to invoke, initiate invocation. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self, get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); + cond_resched_rcu_qs(); } put_online_cpus(); @@ -2075,13 +2043,40 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { - /* Prior xchg orders against prior callback enqueue. */ + /* Prior smp_mb__after_atomic() orders against prior enqueue. */ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } } /* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? + */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, ACCESS_ONCE(*old_rhpp) = rhp; atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. 
*/ /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); @@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = true; + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ - wake_nocb_leader(rdp, true); + if (!irqs_disabled_flags(flags)) { + wake_nocb_leader(rdp, true); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvf")); + } else { + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvfIsDeferred")); + } rdp->qlen_last_fqs_check = LONG_MAX / 2; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } @@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, { if (!rcu_is_nocb_cpu(rdp->cpu)) - return 0; + return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, @@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_callback(rdp->rsp->name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); - return 1; + + /* + * If called from an extended quiescent state with interrupts + * disabled, invoke the RCU core in order to allow the idle-entry + * deferred-wakeup check to function. + */ + if (irqs_disabled_flags(flags) && + !rcu_is_watching() && + cpu_online(smp_processor_id())) + invoke_rcu_core(); + + return true; } /* @@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ if (!rcu_is_nocb_cpu(smp_processor_id())) - return 0; + return false; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; } - return 1; + return true; } /* @@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); if (likely(d)) break; - flush_signals(current); + WARN_ON(signal_pending(current)); trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); @@ -2288,7 +2302,7 @@ wait_again: if (!rcu_nocb_poll) trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "WokeEmpty"); - flush_signals(current); + WARN_ON(signal_pending(current)); schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. */ @@ -2327,6 +2341,7 @@ wait_again: atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); atomic_long_add(rdp->nocb_gp_count_lazy, &rdp->nocb_follower_count_lazy); + smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* * List was empty, wake up the follower. 
@@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); - flush_signals(current); + WARN_ON(signal_pending(current)); schedule_timeout_interruptible(1); } } @@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) -= c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) = + rdp->nocb_p_count_lazy - cl; rdp->n_nocbs_invoked += c; } return 0; } /* Is a deferred wakeup of rcu_nocb_kthread() required? */ -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return ACCESS_ONCE(rdp->nocb_defer_wakeup); } @@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) /* Do a deferred wakeup of rcu_nocb_kthread(). */ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { + int ndw; + if (!rcu_nocb_need_deferred_wakeup(rdp)) return; - ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_nocb_leader(rdp, false); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); + ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); + ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; + wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); +} + +void __init rcu_init_nohz(void) +{ + int cpu; + bool need_rcu_nocb_mask = true; + struct rcu_state *rsp; + +#ifdef CONFIG_RCU_NOCB_CPU_NONE + need_rcu_nocb_mask = false; +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + need_rcu_nocb_mask = true; +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } + have_rcu_nocb_mask = true; + } + if (!have_rcu_nocb_mask) + return; + +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tOffload RCU callbacks from CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tOffload RCU callbacks from all CPUs\n"); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_rcu_flavor(rsp) { + for_each_cpu(cpu, rcu_nocb_mask) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + /* + * If there are early callbacks, they will need + * to be moved to the nocb lists. 
+ */ + WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != + &rdp->nxtlist && + rdp->nxttail[RCU_NEXT_TAIL] != NULL); + init_nocb_callback_list(rdp); + } + rcu_organize_nocb_kthreads(rsp); + } } /* Initialize per-rcu_data variables for no-CBs CPUs. */ @@ -2459,15 +2543,89 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_follower_tail = &rdp->nocb_follower_head; } +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are + * brought online out of order, this can require re-organizing the + * leader-follower relationships. + */ +static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp; + struct rcu_data *rdp_last; + struct rcu_data *rdp_old_leader; + struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); + struct task_struct *t; + + /* + * If this isn't a no-CBs CPU or if it already has an rcuo kthread, + * then nothing to do. + */ + if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) + return; + + /* If we didn't spawn the leader first, reorganize! */ + rdp_old_leader = rdp_spawn->nocb_leader; + if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { + rdp_last = NULL; + rdp = rdp_old_leader; + do { + rdp->nocb_leader = rdp_spawn; + if (rdp_last && rdp != rdp_spawn) + rdp_last->nocb_next_follower = rdp; + if (rdp == rdp_spawn) { + rdp = rdp->nocb_next_follower; + } else { + rdp_last = rdp; + rdp = rdp->nocb_next_follower; + rdp_last->nocb_next_follower = NULL; + } + } while (rdp); + rdp_spawn->nocb_next_follower = rdp_old_leader; + } + + /* Spawn the kthread for this CPU and RCU flavor. */ + t = kthread_run(rcu_nocb_kthread, rdp_spawn, + "rcuo%c/%d", rsp->abbr, cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthreads, spawn them. + */ +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ + struct rcu_state *rsp; + + if (rcu_scheduler_fully_active) + for_each_rcu_flavor(rsp) + rcu_spawn_one_nocb_kthread(rsp, cpu); +} + +/* + * Once the scheduler is running, spawn rcuo kthreads for all online + * no-CBs CPUs. This assumes that the early_initcall()s happen before + * non-boot CPUs come online -- if this changes, we will need to add + * some mutual exclusion. + */ +static void __init rcu_spawn_nocb_kthreads(void) +{ + int cpu; + + for_each_online_cpu(cpu) + rcu_spawn_all_nocb_kthreads(cpu); +} + /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_leader_stride = -1; module_param(rcu_nocb_leader_stride, int, 0444); /* - * Create a kthread for each RCU flavor for each no-CBs CPU. - * Also initialize leader-follower relationships. + * Initialize leader-follower relationships for all no-CBs CPU. */ -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) { int cpu; int ls = rcu_nocb_leader_stride; @@ -2475,14 +2633,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. 
*/ struct rcu_data *rdp_prev = NULL; - struct task_struct *t; - if (rcu_nocb_mask == NULL) + if (!have_rcu_nocb_mask) return; -#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ if (ls == -1) { ls = int_sqrt(nr_cpu_ids); rcu_nocb_leader_stride = ls; @@ -2505,27 +2658,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) rdp_prev->nocb_next_follower = rdp; } rdp_prev = rdp; - - /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_kthread, rdp, - "rcuo%c/%d", rsp->abbr, cpu); - BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp->nocb_kthread) = t; } } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ static bool init_nocb_callback_list(struct rcu_data *rdp) { - if (rcu_nocb_mask == NULL || - !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } @@ -2541,21 +2694,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags) { - return 0; + return false; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - return 0; + return false; } static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return false; } @@ -2564,7 +2717,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) { } @@ -2595,16 +2752,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE -/* - * Define RCU flavor that holds sysidle state. This needs to be the - * most active flavor of RCU. - */ -#ifdef CONFIG_PREEMPT_RCU -static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; -#else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ @@ -2618,9 +2765,14 @@ static int full_sysidle_state; /* Current system-idle state. */ * to detect full-system idle states, not RCU quiescent states and grace * periods. The caller must have disabled interrupts. */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { unsigned long j; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; /* Adjust nesting, check for fully idle. */ if (irq) { @@ -2685,8 +2837,14 @@ void rcu_sysidle_force_exit(void) * usermode execution does -not- count as idle here! The caller must * have disabled interrupts. 
*/ -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + /* Adjust nesting, check for already non-idle. */ if (irq) { rdtp->dynticks_idle_nesting++; @@ -2741,12 +2899,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long j; struct rcu_dynticks *rdtp = rdp->dynticks; + /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ + if (!tick_nohz_full_enabled()) + return; + /* * If some other CPU has already reported non-idle, if this is * not the flavor of RCU that tracks sysidle state, or if this * is an offline or the timekeeping CPU, nothing to do. */ - if (!*isidle || rdp->rsp != rcu_sysidle_state || + if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; if (rcu_gp_in_progress(rdp->rsp)) @@ -2772,7 +2934,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, */ static bool is_sysidle_rcu_state(struct rcu_state *rsp) { - return rsp == rcu_sysidle_state; + return rsp == rcu_state_p; } /* @@ -2850,7 +3012,7 @@ static void rcu_sysidle_cancel(void) static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, unsigned long maxj, bool gpkt) { - if (rsp != rcu_sysidle_state) + if (rsp != rcu_state_p) return; /* Wrong flavor, ignore. */ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) return; /* Running state machine from timekeeping CPU. */ @@ -2867,6 +3029,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + rcu_sysidle_report(rsp, isidle, maxj, true); } @@ -2893,7 +3059,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) /* * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. + * The caller must have disabled interrupts. This is not intended to be + * called unless tick_nohz_full_enabled(). */ bool rcu_sys_is_idle(void) { @@ -2919,13 +3086,12 @@ bool rcu_sys_is_idle(void) /* Scan all the CPUs looking for nonidle CPUs. */ for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rdp = per_cpu_ptr(rcu_state_p->rda, cpu); rcu_sysidle_check_cpu(rdp, &isidle, &maxj); if (!isidle) break; } - rcu_sysidle_report(rcu_sysidle_state, - isidle, maxj, false); + rcu_sysidle_report(rcu_state_p, isidle, maxj, false); oldrss = rss; rss = ACCESS_ONCE(full_sysidle_state); } @@ -2952,7 +3118,7 @@ bool rcu_sys_is_idle(void) * provided by the memory allocator. 
*/ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_sysidle_state) && + !rcu_gp_in_progress(rcu_state_p) && !rsh.inuse && xchg(&rsh.inuse, 1) == 0) call_rcu(&rsh.rh, rcu_sysidle_cb); return false; @@ -2968,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { } -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { } @@ -3036,3 +3202,19 @@ static void rcu_bind_gp_kthread(void) housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } + +/* Record the current task on dyntick-idle entry. */ +static void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. */ +static void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..e0d31a345ee6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -47,6 +47,8 @@ #include <linux/hardirq.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/kthread.h> +#include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -91,7 +93,7 @@ void __rcu_read_unlock(void) barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; @@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** + * rcu_read_lock_held() - might we be in RCU read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, + * this assumes we are in an RCU read-side critical section unless it can + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. + * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. + */ +int rcu_read_lock_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + return lock_is_held(&rcu_lock_map); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_held); + +/** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? 
* * Check for bottom half being disabled, which covers both the @@ -272,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -347,3 +381,397 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context switch, + * user-space execution, and idle. As such, grace periods can take one good + * long time. There are no read-side primitives similar to rcu_read_lock() + * and rcu_read_unlock() because this implementation is intended to get + * the system into a safe state for some of the manipulations involved in + * tracing and the like. Finally, this implementation does not support + * high call_rcu_tasks() rates from multiple CPUs. If this is required, + * per-CPU callback lists will be needed. + */ + +/* Global list of callbacks and associated lock. */ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +module_param(rcu_task_stall_timeout, int, 0644); + +static void rcu_spawn_tasks_kthread(void); + +/* + * Post an RCU-tasks callback. First call must be from process context + * after the scheduler if fully operational. + */ +void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + needwake = !rcu_tasks_cbs_head; + *rcu_tasks_cbs_tail = rhp; + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + if (needwake) { + rcu_spawn_tasks_kthread(); + wake_up(&rcu_tasks_cbs_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. 
+ * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + rcu_lockdep_assert(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!ACCESS_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || + !ACCESS_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + ACCESS_ONCE(t->rcu_tasks_holdout) = false; + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + unsigned long lastreport; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current); + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. 
*/ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rcu_tasks_cbs_wq, + rcu_tasks_cbs_head); + if (!rcu_tasks_cbs_head) { + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_sched() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_sched(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_sched() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_sched(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && ACCESS_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); + ACCESS_ONCE(t->rcu_tasks_holdout) = true; + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. + * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_sched() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. + */ + lastreport = jiffies; + while (!list_empty(&rcu_tasks_holdouts)) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + schedule_timeout_interruptible(HZ); + rtst = ACCESS_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && + time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_sched() + * to force the needed ordering on all such CPUs. + * + * This synchronize_sched() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. 
+ * + * In addition, this synchronize_sched() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. + */ + synchronize_sched(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + schedule_timeout_uninterruptible(HZ/10); + } +} + +/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ +static void rcu_spawn_tasks_kthread(void) +{ + static DEFINE_MUTEX(rcu_tasks_kthread_mutex); + static struct task_struct *rcu_tasks_kthread_ptr; + struct task_struct *t; + + if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { + smp_mb(); /* Ensure caller sees full kthread. */ + return; + } + mutex_lock(&rcu_tasks_kthread_mutex); + if (rcu_tasks_kthread_ptr) { + mutex_unlock(&rcu_tasks_kthread_mutex); + return; + } + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + BUG_ON(IS_ERR(t)); + smp_mb(); /* Ensure others see full kthread. */ + ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; + mutex_unlock(&rcu_tasks_kthread_mutex); +} + +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_PROVE_RCU + +/* + * Early boot self test parameters, one for each flavor + */ +static bool rcu_self_test; +static bool rcu_self_test_bh; +static bool rcu_self_test_sched; + +module_param(rcu_self_test, bool, 0444); +module_param(rcu_self_test_bh, bool, 0444); +module_param(rcu_self_test_sched, bool, 0444); + +static int rcu_self_test_counter; + +static void test_callback(struct rcu_head *r) +{ + rcu_self_test_counter++; + pr_info("RCU test callback executed %d\n", rcu_self_test_counter); +} + +static void early_boot_test_call_rcu(void) +{ + static struct rcu_head head; + + call_rcu(&head, test_callback); +} + +static void early_boot_test_call_rcu_bh(void) +{ + static struct rcu_head head; + + call_rcu_bh(&head, test_callback); +} + +static void early_boot_test_call_rcu_sched(void) +{ + static struct rcu_head head; + + call_rcu_sched(&head, test_callback); +} + +void rcu_early_boot_tests(void) +{ + pr_info("Running RCU self tests\n"); + + if (rcu_self_test) + early_boot_test_call_rcu(); + if (rcu_self_test_bh) + early_boot_test_call_rcu_bh(); + if (rcu_self_test_sched) + early_boot_test_call_rcu_sched(); +} + +static int rcu_verify_early_boot_tests(void) +{ + int ret = 0; + int early_boot_test_counter = 0; + + if (rcu_self_test) { + early_boot_test_counter++; + rcu_barrier(); + } + if (rcu_self_test_bh) { + early_boot_test_counter++; + rcu_barrier_bh(); + } + if (rcu_self_test_sched) { + early_boot_test_counter++; + rcu_barrier_sched(); + } + + if (rcu_self_test_counter != early_boot_test_counter) { + WARN_ON(1); + ret = -1; + } + + return ret; +} +late_initcall(rcu_verify_early_boot_tests); +#else +void rcu_early_boot_tests(void) {} +#endif /* CONFIG_PROVE_RCU */ diff --git a/kernel/reboot.c b/kernel/reboot.c index a3a9e240fcdb..5925f5ae8dff 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* + * Notifier list for kernel code which wants to be called + * to restart the system. + */ +static ATOMIC_NOTIFIER_HEAD(restart_handler_list); + +/** + * register_restart_handler - Register function to be called to reset + * the system + * @nb: Info about handler function to be called + * @nb->priority: Handler priority. Handlers should follow the + * following guidelines for setting priorities. 
+ * 0: Restart handler of last resort, + * with limited restart capabilities + * 128: Default restart handler; use if no other + * restart handler is expected to be available, + * and/or if restart functionality is + * sufficient to restart the entire system + * 255: Highest priority restart handler, will + * preempt all other restart handlers + * + * Registers a function with code to be called to restart the + * system. + * + * Registered functions will be called from machine_restart as last + * step of the restart sequence (if the architecture specific + * machine_restart function calls do_kernel_restart - see below + * for details). + * Registered functions are expected to restart the system immediately. + * If more than one function is registered, the restart handler priority + * selects which function will be called first. + * + * Restart handlers are expected to be registered from non-architecture + * code, typically from drivers. A typical use case would be a system + * where restart functionality is provided through a watchdog. Multiple + * restart handlers may exist; for example, one restart handler might + * restart the entire system, while another only restarts the CPU. + * In such cases, the restart handler which only restarts part of the + * hardware is expected to register with low priority to ensure that + * it only runs if no other means to restart the system is available. + * + * Currently always returns zero, as atomic_notifier_chain_register() + * always returns zero. + */ +int register_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&restart_handler_list, nb); +} +EXPORT_SYMBOL(register_restart_handler); + +/** + * unregister_restart_handler - Unregister previously registered + * restart handler + * @nb: Hook to be unregistered + * + * Unregisters a previously registered restart handler function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&restart_handler_list, nb); +} +EXPORT_SYMBOL(unregister_restart_handler); + +/** + * do_kernel_restart - Execute kernel restart handler call chain + * + * Calls functions registered with register_restart_handler. + * + * Expected to be called from machine_restart as last step of the restart + * sequence. + * + * Restarts the system immediately if a restart handler function has been + * registered. Otherwise does nothing. 
+ */ +void do_kernel_restart(char *cmd) +{ + atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); +} + void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov <xemul@openvz.org> - * - */ - -#include <linux/types.h> -#include <linux/parser.h> -#include <linux/fs.h> -#include <linux/res_counter.h> -#include <linux/uaccess.h> -#include <linux/mm.h> - -void res_counter_init(struct res_counter *counter, struct res_counter *parent) -{ - spin_lock_init(&counter->lock); - counter->limit = RES_COUNTER_MAX; - counter->soft_limit = RES_COUNTER_MAX; - counter->parent = parent; -} - -static u64 res_counter_uncharge_locked(struct res_counter *counter, - unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - -static int res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force) -{ - int ret = 0; - - if (counter->usage + val > counter->limit) { - counter->failcnt++; - ret = -ENOMEM; - if (!force) - return ret; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return ret; -} - -static int __res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at, bool force) -{ - int ret, r; - unsigned long flags; - struct res_counter *c, *u; - - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val, force); - spin_unlock(&c->lock); - if (r < 0 && !ret) { - ret = r; - *limit_fail_at = c; - if (!force) - break; - } - } - - if (ret < 0 && !force) { - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); - } - } - local_irq_restore(flags); - - return ret; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, false); -} - -int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, true); -} - -u64 res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) -{ - unsigned long flags; - struct res_counter *c; - u64 ret = 0; - - local_irq_save(flags); - for (c = counter; c != top; c = c->parent) { - u64 r; - spin_lock(&c->lock); - r = res_counter_uncharge_locked(c, val); - if (c == counter) - ret = r; - spin_unlock(&c->lock); - } - local_irq_restore(flags); - return ret; -} - -u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - return res_counter_uncharge_until(counter, NULL, val); -} - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return &counter->failcnt; - case RES_SOFT_LIMIT: - return &counter->soft_limit; - }; - - BUG(); - return NULL; -} - -ssize_t 
res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -#if BITS_PER_LONG == 32 -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&counter->lock, flags); - ret = *res_counter_member(counter, member); - spin_unlock_irqrestore(&counter->lock, flags); - - return ret; -} -#else -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} -#endif - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *resp) -{ - char *end; - unsigned long long res; - - /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ - if (*buf == '-') { - int rc = kstrtoull(buf + 1, 10, &res); - - if (rc) - return rc; - if (res != 1) - return -EINVAL; - *resp = RES_COUNTER_MAX; - return 0; - } - - res = memparse(buf, &end); - if (*end != '\0') - return -EINVAL; - - if (PAGE_ALIGN(res) >= res) - res = PAGE_ALIGN(res); - else - res = RES_COUNTER_MAX; - - *resp = res; - - return 0; -} diff --git a/kernel/resource.c b/kernel/resource.c index 60c5a3856ab7..0bcebffc4e77 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +/* + * Search for a resouce entry that fully contains the specified region. + * If found, return 1 if it is RAM, 0 if not. + * If not found, or region is not fully contained, return -1 + * + * Used by the ioremap functions to ensure the user is not remapping RAM and is + * a vast speed up over walking through the resource table page by page. + */ +int region_is_ram(resource_size_t start, unsigned long size) +{ + struct resource *p; + resource_size_t end = start + size - 1; + int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + const char *name = "System RAM"; + int ret = -1; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + if (end < p->start) + continue; + + if (p->start <= start && end <= p->end) { + /* resource fully contains region */ + if ((p->flags != flags) || strcmp(p->name, name)) + ret = 0; + else + ret = 1; + break; + } + if (p->end < start) + break; /* not found */ + } + read_unlock(&resource_lock); + return ret; +} + void __weak arch_remove_reservations(struct resource *avail) { } @@ -1245,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent, /* * Managed region resource */ +static void devm_resource_release(struct device *dev, void *ptr) +{ + struct resource **r = ptr; + + release_resource(*r); +} + +/** + * devm_request_resource() - request and reserve an I/O or memory resource + * @dev: device for which to request the resource + * @root: root of the resource tree from which to request the resource + * @new: descriptor of the resource to request + * + * This is a device-managed version of request_resource(). There is usually + * no need to release resources requested by this function explicitly since + * that will be taken care of when the device is unbound from its driver. 
+ * If for some reason the resource needs to be released explicitly, because + * of ordering issues for example, drivers must call devm_release_resource() + * rather than the regular release_resource(). + * + * When a conflict is detected between any existing resources and the newly + * requested resource, an error message will be printed. + * + * Returns 0 on success or a negative error code on failure. + */ +int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new) +{ + struct resource *conflict, **ptr; + + ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + *ptr = new; + + conflict = request_resource_conflict(root, new); + if (conflict) { + dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", + new, conflict->name, conflict); + devres_free(ptr); + return -EBUSY; + } + + devres_add(dev, ptr); + return 0; +} +EXPORT_SYMBOL(devm_request_resource); + +static int devm_resource_match(struct device *dev, void *res, void *data) +{ + struct resource **ptr = res; + + return *ptr == data; +} + +/** + * devm_release_resource() - release a previously requested resource + * @dev: device for which to release the resource + * @new: descriptor of the resource to release + * + * Releases a resource previously requested using devm_request_resource(). + */ +void devm_release_resource(struct device *dev, struct resource *new) +{ + WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, + new)); +} +EXPORT_SYMBOL(devm_release_resource); + struct region_devres { struct resource *parent; resource_size_t start; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c27e4f8f4879 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); static inline struct sched_clock_data *this_scd(void) { - return &__get_cpu_var(sched_clock_data); + return this_cpu_ptr(&sched_clock_data); } static inline struct sched_clock_data *cpu_sdc(int cpu) diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec1a286684a5..c0accc00566e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,22 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#ifdef smp_mb__before_atomic -void __smp_mb__before_atomic(void) -{ - smp_mb__before_atomic(); -} -EXPORT_SYMBOL(__smp_mb__before_atomic); -#endif - -#ifdef smp_mb__after_atomic -void __smp_mb__after_atomic(void) -{ - smp_mb__after_atomic(); -} -EXPORT_SYMBOL(__smp_mb__after_atomic); -#endif - void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -449,7 +439,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); @@ -1010,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1017,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1043,7 +1045,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -1056,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. 
*/ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1088,7 +1090,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1214,7 +1216,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1252,7 +1254,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1284,7 +1286,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1409,7 +1411,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1478,7 +1481,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1537,7 +1540,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1620,6 +1623,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +out: + rcu_read_unlock(); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1742,7 +1769,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -1776,6 +1803,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* + * This function clears the sched_dl_entity static params. 
+ */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. * @@ -1799,10 +1840,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -1825,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; - INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1977,6 +2013,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -1985,6 +2023,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -2002,25 +2042,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2095,7 +2116,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2188,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2200,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. 
*/ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2243,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2277,29 +2304,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); - -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); -#endif + if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2333,21 +2353,14 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2366,6 +2379,18 @@ unsigned long nr_running(void) return sum; } +/* + * Check if only the current task is running on the cpu. + */ +bool single_task_running(void) +{ + if (cpu_rq(smp_processor_id())->nr_running == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + unsigned long long nr_context_switches(void) { int i; @@ -2437,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (task_current(rq, p) && p->on_rq) { - update_rq_clock(rq); - ns = rq_clock_task(rq) - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. 
@@ -2483,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns = 0; + u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* @@ -2497,12 +2484,21 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. */ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &flags); return ns; @@ -2660,6 +2656,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. Otherwise whine @@ -2761,7 +2760,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); prev = rq->curr; schedule_debug(prev); @@ -2801,7 +2800,7 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) update_rq_clock(rq); next = pick_next_task(rq, prev); @@ -2814,15 +2813,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); @@ -2862,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void) * or we have been woken up remotely but the IPI has not yet arrived, * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != IN_USER, but that will trigger + * too frequently to make sense yet. */ - user_exit(); + enum ctx_state prev_state = exception_enter(); schedule(); - user_enter(); + exception_exit(prev_state); } #endif @@ -2910,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. 
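schedule_debug() now optionally checks task_stack_end_corrupted() under CONFIG_SCHED_STACK_END_CHECK, i.e. a magic canary at the end of the task stack. A toy illustration of the canary idea; the structure and magic value below are made up, not the kernel's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical magic value; the kernel defines its own STACK_END_MAGIC. */
#define STACK_END_MAGIC 0x57AC6E9Du

/* Toy "task stack": the canary sits at the lowest address, which is what
 * a downward-growing stack would clobber first on overflow. */
struct toy_stack {
	uint32_t end_magic;
	char space[1024];
};

static int stack_end_corrupted(const struct toy_stack *s)
{
	return s->end_magic != STACK_END_MAGIC;
}

int main(void)
{
	struct toy_stack s = { .end_magic = STACK_END_MAGIC };

	assert(!stack_end_corrupted(&s));	/* healthy stack */

	/* Simulate an overflow scribbling over the canary. */
	memset(&s, 0xAA, sizeof(s.end_magic));
	printf("corrupted: %d\n", stack_end_corrupted(&s));	/* prints 1 */
	return 0;
}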
+ * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -2966,7 +3003,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2995,12 +3032,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -3037,7 +3074,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3048,7 +3085,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3069,8 +3106,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); @@ -3079,7 +3116,7 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and @@ -3351,7 +3388,7 @@ static int __sched_setscheduler(struct task_struct *p, { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3548,19 +3585,19 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). 
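rt_mutex_setprio(), set_user_nice() and __sched_setscheduler() all follow the same dance this series renames from on_rq to queued: take the task off the runqueue, change its priority, put it back. The sketch below shows why that order matters for any priority-ordered queue; the list-based "runqueue" is purely illustrative.

#include <stdio.h>
#include <stdbool.h>

/* Toy "runqueue": a priority-ordered singly linked list of tasks.
 * Names are illustrative; this is not the kernel's data structure. */
struct task { int prio; bool queued; struct task *next; };
struct rq { struct task *head; };

static void enqueue_task(struct rq *rq, struct task *p)
{
	struct task **pp = &rq->head;

	while (*pp && (*pp)->prio <= p->prio)	/* lower value = higher prio */
		pp = &(*pp)->next;
	p->next = *pp;
	*pp = p;
	p->queued = true;
}

static void dequeue_task(struct rq *rq, struct task *p)
{
	struct task **pp = &rq->head;

	while (*pp && *pp != p)
		pp = &(*pp)->next;
	if (*pp)
		*pp = p->next;
	p->queued = false;
}

/* The pattern used by rt_mutex_setprio()/set_user_nice(): the key of a
 * queued element may only change while it is off the queue, otherwise
 * the queue's ordering invariant breaks. */
static void set_prio(struct rq *rq, struct task *p, int prio)
{
	bool queued = p->queued;

	if (queued)
		dequeue_task(rq, p);
	p->prio = prio;
	if (queued)
		enqueue_task(rq, p);
}

int main(void)
{
	struct task a = { .prio = 120 }, b = { .prio = 100 };
	struct rq rq = { 0 };

	enqueue_task(&rq, &a);
	enqueue_task(&rq, &b);
	set_prio(&rq, &a, 90);			/* boost a above b */
	printf("head prio: %d\n", rq.head->prio);	/* prints 90 */
	return 0;
}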
@@ -3984,14 +4021,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -4004,13 +4041,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. */ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; - goto out_unlock; + rcu_read_unlock(); + goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: @@ -4028,7 +4066,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); @@ -4489,8 +4527,10 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + ppid = 0; rcu_read_lock(); - ppid = task_pid_nr(rcu_dereference(p->real_parent)); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), ppid, @@ -4512,7 +4552,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4520,7 +4560,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -4575,7 +4615,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4595,7 +4635,109 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. 
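cpuset_cpumask_can_shrink() boils down to one inequality: the per-CPU deadline bandwidth cap times the proposed CPU count must still cover the bandwidth already admitted. A standalone sketch of that check, with invented example numbers in to_ratio()-style fixed point:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Fixed-point bandwidth, as in to_ratio(): runtime/period scaled by 2^20.
 * The values below are illustrative, not taken from a real system. */
#define BW_SHIFT 20
#define BW_UNIT (1ll << BW_SHIFT)

/* The test behind cpuset_cpumask_can_shrink(): a cpuset may only shrink
 * to trial_cpus CPUs if the already-admitted deadline bandwidth still
 * fits under the per-CPU cap times the smaller CPU count. */
static bool can_shrink(int64_t per_cpu_bw, int64_t total_bw, int trial_cpus)
{
	if (per_cpu_bw == -1)		/* no limit configured */
		return true;
	return per_cpu_bw * trial_cpus >= total_bw;
}

int main(void)
{
	int64_t cap = (95 * BW_UNIT) / 100;	/* 95% per CPU */
	int64_t used = (150 * BW_UNIT) / 100;	/* 150% admitted overall */

	printf("shrink to 2 CPUs: %s\n", can_shrink(cap, used, 2) ? "ok" : "busy");
	printf("shrink to 1 CPU:  %s\n", can_shrink(cap, used, 1) ? "ok" : "busy");
	return 0;
}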
+ */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) @@ -4652,14 +4794,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4680,20 +4823,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4702,16 +4845,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. 
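move_queued_task() is careful to hold only one runqueue lock at a time: dequeue under the old lock, mark the task as migrating, then take the destination lock and enqueue there. A pthread-based analogue of that locking order; the two-element rq array and counters are illustrative only.

#include <stdio.h>
#include <pthread.h>

/* Toy per-CPU runqueue: a counter protected by its own lock. Names and
 * layout are illustrative, not the kernel's. */
struct rq {
	pthread_mutex_t lock;
	int nr_running;
};

struct task { int cpu; int on_rq; };

static struct rq rqs[2] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/* Mirror of the locking dance in move_queued_task(): dequeue under the
 * old rq's lock, mark the task as migrating, drop the old lock, then take
 * the new rq's lock and enqueue there. Both locks are never held at once. */
static void move_queued_task(struct task *p, int new_cpu)
{
	struct rq *src = &rqs[p->cpu], *dst = &rqs[new_cpu];

	pthread_mutex_lock(&src->lock);
	src->nr_running--;		/* dequeue_task() */
	p->on_rq = 0;			/* "TASK_ON_RQ_MIGRATING" window */
	p->cpu = new_cpu;
	pthread_mutex_unlock(&src->lock);

	pthread_mutex_lock(&dst->lock);
	dst->nr_running++;		/* enqueue_task() */
	p->on_rq = 1;
	pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
	struct task t = { .cpu = 0, .on_rq = 1 };

	rqs[0].nr_running = 1;
	move_queued_task(&t, 1);
	printf("cpu0=%d cpu1=%d task now on cpu %d\n",
	       rqs[0].nr_running, rqs[1].nr_running, t.cpu);
	return 0;
}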
*/ - if (p->on_rq) { - dequeue_task(rq_src, p, 0); - set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } @@ -4743,22 +4882,22 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -4778,6 +4917,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; @@ -5188,6 +5333,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, { unsigned long flags; long cpu = (long)hcpu; + struct dl_bw *dl_b; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: @@ -5195,15 +5341,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, /* explicitly allow suspend */ if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); bool overflow; int cpus; + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (overflow) return notifier_from_errno(-EBUSY); } @@ -5746,7 +5896,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5757,10 +5907,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). 
*/ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5770,10 +5920,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); @@ -6011,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6183,7 +6334,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6198,6 +6349,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6251,6 +6452,10 @@ static void sched_init_numa(void) if (!sched_debug()) break; } + + if (!level) + return; + /* * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). 
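The new init_numa_topology_type() classifies the machine from its node distance table: direct, glueless mesh, or backplane. The sketch below applies the same test to a made-up 4-node distance matrix; treating "one hop" as any distance up to REMOTE_DISTANCE is an assumption of this example, not the kernel's exact encoding.

#include <stdio.h>

/* Classify a NUMA distance table the way init_numa_topology_type() does:
 * DIRECT if every node pair is at most one hop apart, GLUELESS_MESH if
 * the most distant pairs can route through an intermediate node that is
 * closer to both, BACKPLANE otherwise. */
enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE };

#define NR_NODES 4
#define REMOTE_DISTANCE 20

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 20 },
	{ 20, 20, 10, 20 },
	{ 30, 20, 20, 10 },
};

static enum numa_topology_type classify(void)
{
	int a, b, c, max = 0;

	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++)
			if (dist[a][b] > max)
				max = dist[a][b];

	if (max <= REMOTE_DISTANCE)	/* all pairs directly connected */
		return NUMA_DIRECT;

	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			if (dist[a][b] < max)
				continue;
			/* a and b are maximally far apart: is there an
			 * intermediate node closer to both of them? */
			for (c = 0; c < NR_NODES; c++)
				if (dist[a][c] < max && dist[b][c] < max)
					return NUMA_GLUELESS_MESH;
			return NUMA_BACKPLANE;
		}
	}
	return NUMA_DIRECT;
}

int main(void)
{
	static const char *names[] = { "direct", "glueless mesh", "backplane" };

	printf("topology: %s\n", names[classify()]);
	return 0;
}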
@@ -6330,6 +6535,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -6905,9 +7113,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -6927,13 +7132,13 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } +#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -7082,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + if (WARN_ONCE(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change)) + __set_current_state(TASK_RUNNING); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ @@ -7113,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7124,13 +7348,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); } @@ -7144,12 +7368,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7164,21 +7388,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7318,36 +7537,40 @@ void sched_offline_group(struct task_group *tg) void sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgrp_id, - lockdep_is_held(&tsk->sighand->siglock)), + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); @@ -7365,10 +7588,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) return 1; - } while_each_thread(g, p); + } return 0; } @@ -7577,6 +7800,7 @@ static int sched_dl_global_constraints(void) u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; @@ -7590,13 +7814,16 @@ static int sched_dl_global_constraints(void) * solutions is welcome! 
*/ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } @@ -7607,6 +7834,7 @@ static int sched_dl_global_constraints(void) static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7620,11 +7848,14 @@ static void sched_dl_do_global(void) * FIXME: As above... */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } } @@ -7754,6 +7985,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8005,7 +8241,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -8016,7 +8252,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = quota; return 0; } @@ -8126,6 +8362,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..020039bd1326 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); void cpudl_cleanup(struct cpudl *cp); -#else -#define cpudl_set(cp, cpu, dl) do { } while (0) -#define cpudl_init() do { } while (0) #endif /* CONFIG_SMP */ #endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#else -#define 
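tg_cfs_schedulable_down() (with the hierarchal -> hierarchical_quota spelling fix above) enforces that a child group's CFS quota never exceeds its parent's effective quota, inheriting the parent's cap when the child is unlimited. A compact sketch of that rule over an invented three-level group tree:

#include <stdio.h>
#include <stdint.h>

/* Toy task-group chain for the tg_cfs_schedulable_down() rule: a child's
 * bandwidth quota may never exceed its parent's effective quota, and an
 * unlimited child simply inherits the parent's value. Numbers are made up. */
#define RUNTIME_INF ((int64_t)-1)

struct tg {
	const char *name;
	int64_t quota;		/* own cfs quota, RUNTIME_INF = unlimited */
	int64_t hier_quota;	/* effective quota, filled in top-down */
	struct tg *parent;
};

static int schedulable_down(struct tg *tg)
{
	int64_t quota = tg->quota;

	if (tg->parent) {
		int64_t parent_quota = tg->parent->hier_quota;

		if (quota == RUNTIME_INF)
			quota = parent_quota;		/* inherit the cap */
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -1;			/* -EINVAL in the kernel */
	}
	tg->hier_quota = quota;
	return 0;
}

int main(void)
{
	struct tg root  = { "root",  RUNTIME_INF, 0, NULL };
	struct tg grp   = { "grp",   100000,      0, &root };
	struct tg child = { "child", 200000,      0, &grp };
	struct tg *walk[] = { &root, &grp, &child };

	for (unsigned i = 0; i < sizeof(walk) / sizeof(walk[0]); i++)
		printf("%-5s -> %s\n", walk[i]->name,
		       schedulable_down(walk[i]) ? "rejected" : "ok");
	return 0;
}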
cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) #endif #endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; + /* Attempt a lockless read on the first round. */ + nextseq = 0; do { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } @@ -550,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..b52092f2636d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,21 +518,29 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). 
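cputime_advance() replaces the max()-based monotonicity fix with a compare-and-swap loop that only ever moves the counter forward. The same loop in portable C11 atomics, standing in for the kernel's ACCESS_ONCE()/cmpxchg_cputime():

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

/* Userspace rendition of cputime_advance(): only ever move the counter
 * forward, even if racing updaters occasionally compute a smaller value
 * (e.g. from scaling inaccuracies). */
static void cputime_advance(_Atomic uint64_t *counter, uint64_t new)
{
	uint64_t old = atomic_load(counter);

	/* Retry while our value is still ahead of the stored one; a failed
	 * compare-exchange reloads 'old' with the current contents. */
	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;
}

int main(void)
{
	_Atomic uint64_t stime = 1000;

	cputime_advance(&stime, 1500);	/* moves forward */
	cputime_advance(&stime, 1200);	/* ignored: would go backward */
	printf("stime = %llu\n", (unsigned long long)atomic_load(&stime));
	return 0;
}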
+ * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; - if (p->on_rq) { + if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -567,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -625,7 +611,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) @@ -823,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } @@ -847,8 +833,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. 
In fact, if it exhausted @@ -914,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -997,10 +994,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); +} +#else /* !CONFIG_SCHED_HRTICK */ +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ } #endif @@ -1030,7 +1028,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. */ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1055,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) /* Running task will never be pushed. */ dequeue_pushable_dl_task(rq, p); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); -#endif set_post_schedule(rq); @@ -1077,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) start_hrtick_dl(rq, p); -#endif } static void task_fork_dl(struct task_struct *p) @@ -1124,10 +1118,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } @@ -1158,7 +1150,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); int best_cpu, cpu = task_cpu(task); @@ -1169,6 +1161,13 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. 
+ */ + cpumask_copy(later_mask, task_rq(task)->rd->span); + cpumask_and(later_mask, later_mask, cpu_active_mask); + cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1257,7 +1256,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1311,6 +1311,7 @@ static int push_dl_task(struct rq *rq) { struct task_struct *next_task; struct rq *later_rq; + int ret = 0; if (!rq->dl.overloaded) return 0; @@ -1356,7 +1357,6 @@ retry: * The task is still there. We don't try * again, some other cpu will pull it when ready. */ - dequeue_pushable_dl_task(rq, next_task); goto out; } @@ -1372,6 +1372,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + ret = 1; resched_curr(later_rq); @@ -1380,7 +1381,7 @@ retry: out: put_task_struct(next_task); - return 1; + return ret; } static void push_dl_tasks(struct rq *rq) @@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1486,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1495,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). @@ -1515,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1564,20 +1586,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. 
+ */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); + + __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. */ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1596,14 +1646,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } @@ -1614,7 +1669,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately @@ -1673,4 +1728,15 @@ const struct sched_class dl_sched_class = { .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, + + .update_curr = update_curr_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..92cc52001e74 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + } + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -264,6 
+261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -332,10 +335,9 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -533,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..40667cbf371b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/cpuidle.h> #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> @@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); @@ -724,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -826,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -865,7 +873,6 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; - struct list_head task_list; struct rcu_head rcu; nodemask_t active_nodes; @@ -893,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. 
+ */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -912,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -928,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. 
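The fault statistics are now consolidated into a single p->numa_faults allocation, indexed by task_faults_idx(stat, nid, priv): four statistics blocks, nr_node_ids nodes each, two fault types per node, with the averaged stats in the first half and the per-scan buffers in the second. A small sketch of that flat layout with an example node count:

#include <stdio.h>
#include <stdlib.h>

/* Layout sketch for the consolidated numa_faults array: four statistics
 * blocks (memory and CPU, averaged and per-scan buffers) times
 * nr_node_ids nodes times two fault types (private/shared), all in one
 * flat allocation. The node count here is just an example. */
enum numa_faults_stats { NUMA_MEM, NUMA_MEMBUF, NUMA_CPU, NUMA_CPUBUF };
#define NR_NUMA_HINT_FAULT_TYPES 2	/* private, shared */
#define NR_NUMA_HINT_FAULT_STATS 4	/* the four enum values above */

static int nr_node_ids = 4;

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	int size = NR_NUMA_HINT_FAULT_STATS * NR_NUMA_HINT_FAULT_TYPES *
		   nr_node_ids;
	unsigned long *faults = calloc(size, sizeof(*faults));

	/* A private memory fault on node 2 lands in the per-scan buffer
	 * half of the array; the averaged NUMA_MEM slot for the same node
	 * sits in the first half. */
	faults[task_faults_idx(NUMA_MEMBUF, 2, 0)]++;

	printf("array size %d, buffered slot %d, averaged slot %d\n",
	       size,
	       task_faults_idx(NUMA_MEMBUF, 2, 0),
	       task_faults_idx(NUMA_MEM, 2, 0));
	free(faults);
	return 0;
}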
*/ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -940,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + total_faults = p->numa_group->total_faults; + + if (!total_faults) + return 0; + + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1038,7 +1131,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,8 +1156,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -1076,6 +1174,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1155,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); + + /* + * Because we have preemption enabled we can get migrated around and + * end try selecting ourselves (current == env->p) as a swap candidate. + */ + if (cur == env->p) + goto unlock; /* * "imp" is the fault differential for the source task between the @@ -1178,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. 
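The update_numa_stats() change derives per-node task capacity from SMT: when the summed CPU capacity is well below nr_cpus * SCHED_CAPACITY_SCALE, the node is assumed to have smt siblings per core and its task capacity is capped at the core count. The same arithmetic with invented example figures:

#include <stdio.h>

/* Arithmetic behind the update_numa_stats() change; the CPU count and
 * per-CPU capacity below are made up for illustration. */
#define SCHED_CAPACITY_SCALE 1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	int cpus = 8;				/* hardware threads in the node */
	long compute_capacity = 8 * 589;	/* each thread ~0.58 of full capacity */

	/* smt := ceil(cpus / capacity) in capacity units; 2 for 2-way SMT. */
	int smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
	long cores = cpus / smt;

	long task_capacity = cores;
	long by_capacity = DIV_ROUND_CLOSEST(compute_capacity,
					     SCHED_CAPACITY_SCALE);
	if (by_capacity < task_capacity)
		task_capacity = by_capacity;

	printf("smt=%d cores=%ld task_capacity=%ld\n",
	       smt, cores, task_capacity);
	return 0;
}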
*/ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1193,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1206,7 +1323,7 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; @@ -1252,6 +1369,13 @@ balance: if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. + */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: @@ -1289,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1317,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. 
+ */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); @@ -1394,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1506,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1543,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. 
*/ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1570,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1591,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. */ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. 
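The hunk above adds preferred_group_nid(), whose backplane case repeatedly folds nodes lying within the current distance cutoff of node 'a' into one candidate group, sums their NUMA hinting faults, and keeps the best-scoring group for the next, tighter pass. A toy user-space sketch of that grouping step follows; the node count, distance matrix and fault counts are made-up values, not kernel data.

#include <stdio.h>
#include <stdbool.h>

/*
 * Toy sketch of the grouping step inside preferred_group_nid():
 * nodes closer to 'a' than the current distance cutoff are folded into
 * one candidate group and their fault counts summed (including a == b).
 */
#define NODES 4

static const int node_distance[NODES][NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};

static unsigned long group_faults_sum(const unsigned long *faults, int a,
                                      int dist, bool *in_group)
{
        unsigned long sum = 0;

        for (int b = 0; b < NODES; b++) {
                in_group[b] = node_distance[a][b] < dist;
                if (in_group[b])
                        sum += faults[b];
        }
        return sum;
}

int main(void)
{
        unsigned long faults[NODES] = { 5, 7, 1, 2 };
        bool in_group[NODES];

        /* Nodes 0 and 1 form one group at a cutoff of 40. */
        printf("faults near node 0: %lu\n",
               group_faults_sum(faults, 0, 40, in_group));
        return 0;
}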
+ */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1625,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); spin_unlock_irq(group_lock); - max_nid = max_group_nid; + max_nid = preferred_group_nid(p, max_group_nid); } if (max_faults) { @@ -1668,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * @@ -1677,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; - list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); } @@ -1736,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; - list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; grp->nr_tasks++; @@ -1762,27 +1996,23 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; - list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1804,29 +2034,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. 
The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. - */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1866,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -1946,7 +2162,7 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma)) continue; /* @@ -2211,8 +2427,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (n<PERIOD) + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n<PERIOD) * * To achieve constant time decay_load. */ @@ -2377,6 +2593,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; @@ -3786,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } @@ -3892,14 +4115,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. 
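Several hunks above replace the separate numa_faults_memory/cpu/buffer pointers with a single numa_faults array indexed by a (region, node, private/shared) triple, using the NUMA_MEM/NUMA_CPU/NUMA_MEMBUF/NUMA_CPUBUF regions defined later in this diff (kernel/sched/sched.h). A minimal stand-alone sketch of one plausible flat layout for such an index helper follows; the constants are illustrative, not the kernel's.

#include <stdio.h>

/*
 * One plausible flat layout for the unified numa_faults array: four
 * regions (averaged mem/cpu plus their scan buffers), each holding
 * nr_node_ids * 2 private/shared counters.
 */
enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2      /* private, shared */
static const int nr_node_ids = 4;       /* example: 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
        printf("mem    idx (nid=1, priv=1): %d\n",
               task_faults_idx(NUMA_MEM, 1, 1));
        printf("membuf idx (nid=1, priv=1): %d\n",
               task_faults_idx(NUMA_MEMBUF, 1, 1));
        return 0;
}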
- */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } @@ -4087,7 +4302,7 @@ static unsigned long capacity_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -4213,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * tg->shares) / W; + wl = (w * (long)tg->shares) / W; else wl = tg->shares; @@ -4276,8 +4491,8 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4320,47 +4535,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; + balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* @@ -4428,20 +4626,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. 
+ */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } /* @@ -4510,14 +4734,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -4704,7 +4922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -5112,27 +5330,18 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; /* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. 
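The rewritten find_idlest_cpu() above prefers an idle CPU whose idle state has the smallest exit latency, breaks ties toward the most recently idled CPU, and only falls back to the least-loaded CPU when nothing is idle. A simplified stand-alone model of that selection policy follows; the struct fields and sample values are hypothetical, not the kernel's rq/cpuidle data.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct cpu_sample {
        bool            idle;
        unsigned int    exit_latency;   /* meaningful when idle */
        uint64_t        idle_stamp;     /* meaningful when idle */
        unsigned long   load;           /* meaningful when busy */
};

/* Prefer shallowest idle state, then most recent idle stamp, then load. */
static int pick_cpu(const struct cpu_sample *cpus, int n)
{
        unsigned int min_exit = (unsigned int)-1;
        uint64_t latest_stamp = 0;
        unsigned long min_load = (unsigned long)-1;
        int shallowest = -1, least_loaded = 0;

        for (int i = 0; i < n; i++) {
                if (cpus[i].idle) {
                        if (cpus[i].exit_latency < min_exit ||
                            (cpus[i].exit_latency == min_exit &&
                             cpus[i].idle_stamp > latest_stamp)) {
                                min_exit = cpus[i].exit_latency;
                                latest_stamp = cpus[i].idle_stamp;
                                shallowest = i;
                        }
                } else if (shallowest == -1 && cpus[i].load < min_load) {
                        min_load = cpus[i].load;
                        least_loaded = i;
                }
        }
        return shallowest != -1 ? shallowest : least_loaded;
}

int main(void)
{
        struct cpu_sample cpus[] = {
                { .idle = true,  .exit_latency = 10, .idle_stamp = 100 },
                { .idle = true,  .exit_latency = 2,  .idle_stamp = 50  },
                { .idle = false, .load = 300 },
        };

        printf("picked cpu %d\n", pick_cpu(cpus, 3));  /* expect cpu 1 */
        return 0;
}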
- */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - -/* * Is this task likely cache-hot: */ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5164,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5203,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -5252,6 +5461,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5310,24 +5522,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } - return 1; } @@ -5336,47 +5536,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + +/* + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. */ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + detach_task(p, env); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). 
+ * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is detach_tasks) + * so we can safely collect stats here rather than + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; } static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. */ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5407,14 +5623,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5434,13 +5652,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); + + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); - return pulled; + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. 
+ */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5559,6 +5822,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5572,7 +5842,7 @@ struct sg_lb_stats { unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? */ + enum group_type group_type; int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -5610,6 +5880,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5652,19 +5924,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) @@ -5703,18 +5973,15 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_cpu_capacity(sd, cpu); + else + capacity *= default_scale_cpu_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - } + capacity >>= SCHED_CAPACITY_SHIFT; sdg->sgc->capacity_orig = capacity; @@ -5891,6 +6158,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro return capacity_factor; } +static enum group_type +group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->group_capacity_factor) + return group_overloaded; + + if (sg_imbalanced(group)) + return group_imbalanced; + + return group_other; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -5920,7 +6199,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) *overload = true; @@ -5942,9 +6221,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - - sgs->group_imb = sg_imbalanced(group); sgs->group_capacity_factor = sg_capacity_factor(env, group); + sgs->group_type = group_classify(group, sgs); if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; @@ -5968,13 +6246,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5982,8 +6266,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -6073,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) + sds->local_stat.group_has_free_capacity) { sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + sgs->group_type = group_classify(sg, sgs); + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6228,7 +6513,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6248,12 +6533,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ + /* + * If there aren't any idle cpus, avoid creating some. + */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); @@ -6337,7 +6621,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. 
*/ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ @@ -6346,7 +6630,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* - * If the local group is more busy than the selected busiest group + * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ if (local->avg_load >= busiest->avg_load) @@ -6361,13 +6645,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* @@ -6539,7 +6824,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, @@ -6550,6 +6835,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6599,23 +6885,30 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); - local_irq_restore(flags); + cur_ld_moved = detach_tasks(&env); /* - * some other cpu did the load balance for us. + * We've detached some tasks from busiest_rq. Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. 
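The load_balance() changes above stop taking both runqueue locks at once: tasks are detached from the busiest runqueue under its lock alone, marked TASK_ON_RQ_MIGRATING, and only then attached to the destination runqueue under that runqueue's lock. A rough user-space analogue of the same two-phase move between two locked queues follows; all types and names are invented for illustration.

#include <stdio.h>
#include <pthread.h>

/*
 * Pull items out of the source queue while holding only the source
 * lock, park them on a private list, then insert them into the
 * destination queue while holding only the destination lock, so the
 * two locks are never held at the same time.
 */
struct item {
        int id;
        struct item *next;
};

struct queue {
        pthread_mutex_t lock;
        struct item *head;
};

static struct item *detach_all(struct queue *src)
{
        struct item *list;

        pthread_mutex_lock(&src->lock);
        list = src->head;               /* "detached": no longer reachable */
        src->head = NULL;
        pthread_mutex_unlock(&src->lock);

        return list;
}

static void attach_all(struct queue *dst, struct item *list)
{
        pthread_mutex_lock(&dst->lock);
        while (list) {
                struct item *p = list;

                list = list->next;
                p->next = dst->head;    /* "attached" to the new queue */
                dst->head = p;
        }
        pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
        struct queue src = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct queue dst = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct item a = { 1, NULL };
        struct item b = { 2, &a };

        src.head = &b;
        attach_all(&dst, detach_all(&src));

        for (struct item *p = dst.head; p; p = p->next)
                printf("moved item %d\n", p->id);
        return 0;
}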
*/ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + + local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -6665,10 +6958,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6679,7 +6970,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6744,7 +7035,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -6753,6 +7044,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. + */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -6914,6 +7222,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6933,9 +7242,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6956,16 +7262,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } @@ -7465,7 +7777,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7490,11 +7802,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. 
*/ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7521,15 +7833,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7575,7 +7887,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7594,7 +7906,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7605,14 +7917,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP @@ -7835,6 +8147,8 @@ const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .update_curr = update_curr_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,6 +147,9 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); + /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -154,6 +157,9 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); + /* The cpu is no longer idle or about to enter idle. 
*/ + idle_set_state(this_rq(), NULL); + if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 67ad4e7f506a..c65dac8c97cd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task return 0; } +static void update_curr_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, + .update_curr = update_curr_idle, }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (p->nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1351,16 +1348,22 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule @@ -1448,7 +1451,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. 
*/ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1468,8 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); @@ -1526,7 +1528,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1624,7 +1626,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1658,7 +1660,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1809,7 +1811,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1870,7 +1872,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1936,7 +1938,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) @@ -1970,7 +1972,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1989,7 +1991,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { @@ -2073,7 +2075,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } @@ -2129,6 +2131,8 @@ const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + + .update_curr = update_curr_rt, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -14,6 +14,11 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -126,6 +131,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: @@ -168,6 +176,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -184,7 +211,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 runtime_expires; int idle, timer_active; @@ -636,6 +663,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -647,13 +679,13 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) +#define this_rq() this_cpu_ptr(&runqueues) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(&runqueues) static inline u64 rq_clock(struct rq *rq) { @@ -665,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct 
*p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -942,6 +992,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -953,7 +1012,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -991,35 +1049,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ @@ -1135,6 +1164,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1143,6 +1177,8 @@ struct sched_class { unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); + void (*update_curr) (struct rq *rq); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif @@ -1180,6 +1216,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1486,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..79ffec45a6ac 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); @@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) return 0; } +static void update_curr_stop(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU stop tasks: */ @@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, + .update_curr = update_curr_stop, }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 15cab1a4f84e..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long 
wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. + */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; @@ -343,6 +409,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + wait.key.timeout = jiffies + timeout; + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) @@ -520,3 +598,27 @@ __sched int bit_wait_io(struct wait_bit_key *word) return 0; } EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..4ef9687ac115 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -21,10 +21,11 @@ #include <linux/slab.h> #include <linux/syscalls.h> -/* #define SECCOMP_DEBUG 1 */ +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +#include <asm/syscall.h> +#endif #ifdef CONFIG_SECCOMP_FILTER -#include <asm/syscall.h> #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> @@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. 
*/ -static u32 seccomp_run_filters(int syscall) +static u32 seccomp_run_filters(struct seccomp_data *sd) { struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); - struct seccomp_data sd; + struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ @@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall) /* Make sure cross-thread synced filter points somewhere sane. */ smp_read_barrier_depends(); - populate_seccomp_data(&sd); + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; @@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: @@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = { }; #endif -int __secure_computing(int this_syscall) +static void __secure_computing_strict(int this_syscall) +{ + int *syscall_whitelist = mode1_syscalls; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + syscall_whitelist = mode1_syscalls_32; +#endif + do { + if (*syscall_whitelist == this_syscall) + return; + } while (*++syscall_whitelist); + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + do_exit(SIGKILL); +} + +#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER +void secure_computing_strict(int this_syscall) +{ + int mode = current->seccomp.mode; + + if (mode == 0) + return; + else if (mode == SECCOMP_MODE_STRICT) + __secure_computing_strict(this_syscall); + else + BUG(); +} +#else +int __secure_computing(void) +{ + u32 phase1_result = seccomp_phase1(NULL); + + if (likely(phase1_result == SECCOMP_PHASE1_OK)) + return 0; + else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) + return -1; + else + return seccomp_phase2(phase1_result); +} + +#ifdef CONFIG_SECCOMP_FILTER +static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) { - int exit_sig = 0; - int *syscall; - u32 ret; + u32 filter_ret, action; + int data; /* * Make sure that any changes to mode from another thread have @@ -576,85 +623,127 @@ int __secure_computing(int this_syscall) */ rmb(); - switch (current->seccomp.mode) { - case SECCOMP_MODE_STRICT: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; + filter_ret = seccomp_run_filters(sd); + data = filter_ret & SECCOMP_RET_DATA; + action = filter_ret & SECCOMP_RET_ACTION; + + switch (action) { + case SECCOMP_RET_ERRNO: + /* Set the low-order 16-bits as a errno. 
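The seccomp_run_filters() rework above keeps the rule that every attached filter is evaluated and the numerically lowest return action wins, while letting callers pass in a pre-populated seccomp_data. A tiny stand-alone illustration of that "lowest action wins" reduction follows; the DEMO_* masks and values are placeholders, not the real SECCOMP_RET_* constants.

#include <stdio.h>
#include <stdint.h>

#define DEMO_RET_ACTION 0xffff0000U
#define DEMO_RET_KILL   0x00000000U
#define DEMO_RET_ERRNO  0x00050000U
#define DEMO_RET_ALLOW  0x7fff0000U

/* Evaluate every filter result; the lowest action (data bits ignored) wins. */
static uint32_t run_filters(const uint32_t *rets, int n)
{
        uint32_t ret = DEMO_RET_ALLOW;

        for (int i = 0; i < n; i++) {
                if ((rets[i] & DEMO_RET_ACTION) < (ret & DEMO_RET_ACTION))
                        ret = rets[i];
        }
        return ret;
}

int main(void)
{
        uint32_t rets[] = { DEMO_RET_ALLOW, DEMO_RET_ERRNO | 13 };

        /* The errno action wins because its action value is lower. */
        printf("winning return: 0x%08x\n", (unsigned)run_filters(rets, 2));
        return 0;
}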
*/ + syscall_set_return_value(current, task_pt_regs(current), + -data, 0); + goto skip; + + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, task_pt_regs(current)); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + + case SECCOMP_RET_TRACE: + return filter_ret; /* Save the rest for phase 2. */ + + case SECCOMP_RET_ALLOW: + return SECCOMP_PHASE1_OK; + + case SECCOMP_RET_KILL: + default: + audit_seccomp(this_syscall, SIGSYS, action); + do_exit(SIGSYS); + } + + unreachable(); + +skip: + audit_seccomp(this_syscall, 0, action); + return SECCOMP_PHASE1_SKIP; +} #endif - do { - if (*syscall == this_syscall) - return 0; - } while (*++syscall); - exit_sig = SIGKILL; - ret = SECCOMP_RET_KILL; - break; + +/** + * seccomp_phase1() - run fast path seccomp checks on the current syscall + * @arg sd: The seccomp_data or NULL + * + * This only reads pt_regs via the syscall_xyz helpers. The only change + * it will make to pt_regs is via syscall_set_return_value, and it will + * only do that if it returns SECCOMP_PHASE1_SKIP. + * + * If sd is provided, it will not read pt_regs at all. + * + * It may also call do_exit or force a signal; these actions must be + * safe. + * + * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should + * be processed normally. + * + * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be + * invoked. In this case, seccomp_phase1 will have set the return value + * using syscall_set_return_value. + * + * If it returns anything else, then the return value should be passed + * to seccomp_phase2 from a context in which ptrace hooks are safe. + */ +u32 seccomp_phase1(struct seccomp_data *sd) +{ + int mode = current->seccomp.mode; + int this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); + + switch (mode) { + case SECCOMP_MODE_STRICT: + __secure_computing_strict(this_syscall); /* may call do_exit */ + return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER - case SECCOMP_MODE_FILTER: { - int data; - struct pt_regs *regs = task_pt_regs(current); - ret = seccomp_run_filters(this_syscall); - data = ret & SECCOMP_RET_DATA; - ret &= SECCOMP_RET_ACTION; - switch (ret) { - case SECCOMP_RET_ERRNO: - /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, - -data, 0); - goto skip; - case SECCOMP_RET_TRAP: - /* Show the handler the original registers. */ - syscall_rollback(current, regs); - /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); - goto skip; - case SECCOMP_RET_TRACE: - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - goto skip; - } - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. 
*/ - - return 0; - case SECCOMP_RET_ALLOW: - return 0; - case SECCOMP_RET_KILL: - default: - break; - } - exit_sig = SIGSYS; - break; - } + case SECCOMP_MODE_FILTER: + return __seccomp_phase1_filter(this_syscall, sd); #endif default: BUG(); } +} -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - audit_seccomp(this_syscall, exit_sig, ret); - do_exit(exit_sig); -#ifdef CONFIG_SECCOMP_FILTER -skip: - audit_seccomp(this_syscall, exit_sig, ret); -#endif - return -1; +/** + * seccomp_phase2() - finish slow path seccomp work for the current syscall + * @phase1_result: The return value from seccomp_phase1() + * + * This must be called from a context in which ptrace hooks can be used. + * + * Returns 0 if the syscall should be processed or -1 to skip the syscall. + */ +int seccomp_phase2(u32 phase1_result) +{ + struct pt_regs *regs = task_pt_regs(current); + u32 action = phase1_result & SECCOMP_RET_ACTION; + int data = phase1_result & SECCOMP_RET_DATA; + + BUG_ON(action != SECCOMP_RET_TRACE); + + audit_seccomp(syscall_get_nr(current, regs), 0, action); + + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); + return -1; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + do_exit(SIGSYS); + if (syscall_get_nr(current, regs) < 0) + return -1; /* Explicit request to skip. */ + + return 0; } +#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..16a305295256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, local_irq_restore(*flags); break; } - + /* + * This sighand can be already freed and even reused, but + * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * initializes ->siglock: this slab can't go away, it has + * the same object type, ->siglock can't be reinitialized. + * + * We need to ensure that tsk->sighand is still the same + * after we take the lock, we can race with de_thread() or + * __exit_signal(). In the latter case the next iteration + * must see ->sighand == NULL. + */ spin_lock(&sighand->siglock); if (likely(sighand == tsk->sighand)) { rcu_read_unlock(); @@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) int error = -ESRCH; struct task_struct *p; - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. - */ - goto retry; - } - rcu_read_unlock(); + for (;;) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); + if (p) + error = group_send_sig_info(sig, info, p); + rcu_read_unlock(); + if (likely(!p || error != -ESRCH)) + return error; - return error; + /* + * The task was unhashed in between, try again. If it + * is dead, pid_task() will return NULL, if we race with + * de_thread() it will find the new leader. 
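For context on how the new split is meant to be consumed: an arch selecting CONFIG_HAVE_ARCH_SECCOMP_FILTER can hand seccomp_phase1() a seccomp_data it built itself, keeping pt_regs out of the fast path, and only fall into seccomp_phase2() for TRACE results. A hedged sketch; the entry-point name is invented and error handling is simplified:

#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <asm/syscall.h>

/* Sketch (not from this patch): an arch fast path that fills seccomp_data
 * itself so seccomp_phase1() never has to read pt_regs. */
static long demo_arch_seccomp_entry(struct pt_regs *regs)
{
	struct seccomp_data sd;
	unsigned long args[6];
	u32 phase1;
	int i;

	sd.nr = syscall_get_nr(current, regs);
	sd.arch = syscall_get_arch();
	syscall_get_arguments(current, regs, 0, 6, args);
	for (i = 0; i < 6; i++)
		sd.args[i] = args[i];
	sd.instruction_pointer = instruction_pointer(regs);

	phase1 = seccomp_phase1(&sd);
	if (phase1 == SECCOMP_PHASE1_OK)
		return 0;		/* run the syscall normally */
	if (phase1 == SECCOMP_PHASE1_SKIP)
		return -1;		/* return value already set; skip it */

	/* SECCOMP_RET_TRACE: finish where ptrace hooks are safe to call */
	return seccomp_phase2(phase1);
}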
+ */ + } } int kill_proc_info(int sig, struct siginfo *info, pid_t pid) @@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif +#ifdef SEGV_BNDERR + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..f38a1e692259 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/sched.h> #include "smpboot.h" @@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, if (!csd) { csd = &csd_stack; if (!wait) - csd = &__get_cpu_var(csd_data); + csd = this_cpu_ptr(&csd_data); } csd_lock(csd); @@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) WARN_ON(!irqs_disabled()); - head = &__get_cpu_var(call_single_queue); + head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); @@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask, return; } - cfd = &__get_cpu_var(cfd_data); + cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, cfd->cpumask); @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. 
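wake_up_all_idle_cpus() is aimed at code that changes a condition idle CPUs are sleeping or polling on and needs them to notice promptly. A sketch of such a caller; the latency-limit driver here is invented for illustration:

#include <linux/smp.h>

/* Invented example: a latency limit that the idle loop consults when it
 * picks an idle state. None of these names come from the patch. */
static int demo_latency_us;

static void demo_set_latency(int us)
{
	demo_latency_us = us;

	/*
	 * CPUs that are already idle (including ones in polling idle) will
	 * not see the new limit until something wakes them; kick them so
	 * their next idle-state selection honours it. Busy CPUs are left
	 * alone, exactly as the kerneldoc above describes.
	 */
	wake_up_all_idle_cpus();
}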
+ */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..501baa9ac1be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -278,7 +278,7 @@ restart: pending >>= softirq_bit; } - rcu_bh_qs(smp_processor_id()); + rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); @@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_vec.head); __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); local_irq_enable(); while (list) { @@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); + __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); local_irq_enable(); while (list) { @@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. 
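The smpboot_thread_fn() state machine reworked above services per-CPU threads registered through smpboot_register_percpu_thread(); a minimal, hypothetical client looks roughly like this (setup/park/unpark callbacks omitted):

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);	/* invented */
static DEFINE_PER_CPU(unsigned long, demo_pending);

static int demo_should_run(unsigned int cpu)
{
	return per_cpu(demo_pending, cpu) != 0;
}

static void demo_fn(unsigned int cpu)
{
	per_cpu(demo_pending, cpu) = 0;
	/* per-cpu work runs here, with preemption enabled, as above */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_init(void)
{
	return smpboot_register_percpu_thread(&demo_threads);
}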
*/ __do_softirq(); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); local_irq_enable(); cond_resched(); return; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + int i; + unsigned long ip; + int generated; + int total = 0; + + if (WARN_ON(!trace->entries)) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + ip = trace->entries[i]; + generated = snprintf(buf, size, "%*c[<%p>] %pS\n", + 1 + spaces, ' ', (void *) ip, (void *) ip); + + total += generated; + + /* Assume that generated isn't a negative number */ + if (generated >= size) { + buf += size; + size = 0; + } else { + buf += generated; + size -= generated; + } + } + + return total; +} +EXPORT_SYMBOL_GPL(snprint_stack_trace); + /* * Architectures that do not implement save_stack_trace_tsk or * save_stack_trace_regs get this weak alias and a once-per-bootup warning diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..a8c9f5a7dda6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -62,28 +62,28 @@ #include <asm/unistd.h> #ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) +# define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) +# define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) +# define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) +# define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) +# define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) +# define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) +# define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) +# define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) @@ -91,6 +91,12 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif +#ifndef MPX_ENABLE_MANAGEMENT +# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +#endif +#ifndef MPX_DISABLE_MANAGEMENT +# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -182,39 +188,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error 
= set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* For find_user() */ - break; + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* For find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -244,47 +251,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) + goto out_unlock; /* No processes for this user */ + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* for find_user() */ - break; + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* for find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -306,7 +314,7 @@ out_unlock: * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). @@ -364,7 +372,7 @@ error: } /* - * setgid() is implemented like SysV w/ SAVED_IDS + * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ @@ -442,7 +450,7 @@ static int set_user(struct cred *new) * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. 
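The setpriority()/getpriority() blocks above are largely re-indented rather than functionally changed; as a reminder of the semantics they implement, the which argument selects a process, a process group, or all of one user's processes, and a who of 0 means the caller. A small userspace illustration:

#include <sys/time.h>
#include <sys/resource.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int prio;

	/* -1 is a legal return value, so errno must be cleared first. */
	errno = 0;
	prio = getpriority(PRIO_PROCESS, 0);	/* 0 == calling process */
	if (prio == -1 && errno)
		perror("getpriority");

	/* Raising niceness (lowering priority) is always allowed;
	 * lowering it generally needs CAP_SYS_NICE or RLIMIT_NICE room. */
	if (setpriority(PRIO_PROCESS, 0, prio + 1))
		perror("setpriority");

	return 0;
}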
A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { @@ -503,17 +511,17 @@ error: abort_creds(new); return retval; } - + /* - * setuid() is implemented like SysV with SAVED_IDS - * + * setuid() is implemented like SysV with SAVED_IDS + * * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal + * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. + * regain them by swapping the real and effective uid. */ SYSCALL_DEFINE1(setuid, uid_t, uid) { @@ -637,10 +645,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(ruid, ruidp)) && - !(retval = put_user(euid, euidp))) - retval = put_user(suid, suidp); - + retval = put_user(ruid, ruidp); + if (!retval) { + retval = put_user(euid, euidp); + if (!retval) + return put_user(suid, suidp); + } return retval; } @@ -709,9 +719,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(rgid, rgidp)) && - !(retval = put_user(egid, egidp))) - retval = put_user(sgid, sgidp); + retval = put_user(rgid, rgidp); + if (!retval) { + retval = put_user(egid, egidp); + if (!retval) + retval = put_user(sgid, sgidp); + } return retval; } @@ -862,11 +875,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); @@ -1284,7 +1295,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) /* * Back compatibility for getrlimit. Needed for some apps. */ - SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { @@ -1299,7 +1309,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; + return copy_to_user(rlim, &x, sizeof(x)) ? 
-EFAULT : 0; } #endif @@ -1527,7 +1537,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; - memset((char *) r, 0, sizeof *r); + memset((char *)r, 0, sizeof (*r)); utime = stime = 0; if (who == RUSAGE_THREAD) { @@ -1541,41 +1551,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) return; switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - } while_each_thread(p, t); + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; + + if (who == RUSAGE_CHILDREN) break; - default: - BUG(); + case RUSAGE_SELF: + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); + break; + + default: + BUG(); } unlock_task_sighand(p, &flags); @@ -1585,6 +1595,7 @@ out: if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); + if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); @@ -1596,6 +1607,7 @@ out: int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -1628,12 +1640,14 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) { struct fd exe; struct inode *inode; int err; + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + exe = fdget(fd); if (!exe.file) return -EBADF; @@ -1654,8 +1668,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - down_write(&mm->mmap_sem); - /* * Forbid mm->exe_file change if old file still mapped. 
*/ @@ -1667,7 +1679,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (vma->vm_file && path_equal(&vma->vm_file->f_path, &mm->exe_file->f_path)) - goto exit_unlock; + goto exit; } /* @@ -1678,34 +1690,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EPERM; if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit_unlock; + goto exit; err = 0; set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ -exit_unlock: - up_write(&mm->mmap_sem); - exit: fdput(exe); return err; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. 
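validate_prctl_map() is the gatekeeper for the new one-shot PR_SET_MM_MAP interface (CONFIG_CHECKPOINT_RESTORE only). A checkpoint/restore tool would drive it roughly as below; every address field must be filled with sane, properly ordered values from the restored image or the call fails with -EINVAL, and the helper name here is invented:

#include <stdint.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* struct prctl_mm_map, PR_SET_MM_MAP* */

static int restore_mm_layout(struct prctl_mm_map *map)
{
	unsigned int kernel_size = 0;

	/* The kernel reports the struct size it expects; refuse a mismatch. */
	if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE,
		  (unsigned long)&kernel_size, 0, 0))
		return -1;
	if (kernel_size != sizeof(*map))
		return -1;

	/*
	 * The caller fills start_code/end_code, start_data/end_data,
	 * start_brk, brk, start_stack, the arg/env ranges and (optionally)
	 * auxv from the checkpoint image. exe_fd == -1 leaves /proc/pid/exe
	 * untouched; anything else requires uid/gid 0 in the caller's
	 * user namespace, as the comment above explains.
	 */
	map->exe_fd = (uint32_t)-1;

	return prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)map,
		     sizeof(*map), 0);
}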
+ */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + down_write(&mm->mmap_sem); + if (prctl_map.exe_fd != (u32)-1) + error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); + downgrade_write(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. 
+ */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - unsigned long rlim = rlimit(RLIMIT_DATA); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (opt == PR_SET_MM_EXE_FILE) - return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_EXE_FILE) { + down_write(&mm->mmap_sem); + error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); + up_write(&mm->mmap_sem); + return error; + } if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; @@ -1733,9 +1933,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, + mm->end_data, mm->start_data)) goto out; mm->start_brk = addr; @@ -1745,9 +1944,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, + mm->end_data, mm->start_data)) goto out; mm->brk = addr; @@ -2011,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, me->mm->def_flags &= ~VM_NOHUGEPAGE; up_write(&me->mm->mmap_sem); break; + case PR_MPX_ENABLE_MANAGEMENT: + error = MPX_ENABLE_MANAGEMENT(me); + break; + case PR_MPX_DISABLE_MANAGEMENT: + error = MPX_DISABLE_MANAGEMENT(me); + break; default: error = -EINVAL; break; @@ -2023,6 +2227,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, { int err = 0; int cpu = raw_smp_processor_id(); + if (cpup) err |= put_user(cpu, cpup); if (nodep) @@ -2135,7 +2340,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { + if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_uselib); +cond_syscall(sys_fadvise64); +cond_syscall(sys_fadvise64_64); +cond_syscall(sys_madvise); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); @@ -166,6 +169,8 @@ cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); cond_syscall(sys_subpage_prot); +cond_syscall(sys_s390_pci_mmio_read); +cond_syscall(sys_s390_pci_mmio_write); /* mmu depending weak syscall entries */ cond_syscall(sys_mprotect); @@ -218,3 +223,9 @@ cond_syscall(sys_kcmp); /* operate on Secure Computing state */ cond_syscall(sys_seccomp); 
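The sysinfo hunk near the end of this chunk replaces an open-coded '>> 32' with upper_32_bits() from <linux/kernel.h>: s.totalram is an unsigned long, which is 32 bits wide on 32-bit kernels, and shifting a 32-bit value by its full width is undefined in C, so the helper splits the shift into two 16-bit steps. A hedged usage sketch with an invented helper:

#include <linux/kernel.h>
#include <linux/sysinfo.h>

/* Illustrative helper (not in the patch): true if either counter would
 * overflow a 32-bit field and therefore needs mem_unit rescaling. */
static bool sysinfo_needs_scaling(const struct sysinfo *s)
{
	return upper_32_bits(s->totalram) || upper_32_bits(s->totalswap);
}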
+ +/* access BPF programs and maps */ +cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75875a741b5e..137c7f69b264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", @@ -622,6 +623,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_KEXEC { @@ -1055,15 +1063,6 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif -#ifdef CONFIG_RCU_TORTURE_TEST - { - .procname = "rcutorture_runnable", - .data = &rcutorture_runnable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PERF_EVENTS /* * User-space scripts rely on the existence of this file @@ -1112,6 +1111,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -1460,13 +1468,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif - { - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = scan_unevictable_handler, - }, #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e4ba9a5a5ccb..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, {} }; @@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..670fff88a961 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, f.file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; @@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) fill_tgid_exit(tsk); } - listeners = __this_cpu_ptr(&listener_array); + listeners = raw_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 
7347426fa68d..f622cf28628a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o -obj-$(CONFIG_TEST_UDELAY) += udelay_test.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o $(obj)/time.o: $(obj)/timeconst.h diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2e949cc9c9f1..b79f39bda7e1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); - /* Add clocksource to the clcoksource list */ + /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..37e50aadd471 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (!hrtimer_hres_active()) return; @@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) */ debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * We must preserve the CALLBACK state flag here, * otherwise we could move the timer base in @@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && hrtimer_reprogram(timer, new_base)) { /* * Only allow reprogramming if the new base is on this CPU. 
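The hrtimer hunks above are part of a tree-wide conversion: &__get_cpu_var(var) and this_cpu_ptr(&var) both return a pointer to the executing CPU's instance of a per-cpu variable, with raw_cpu_ptr() as the variant that skips the preemption debug checks. A small sketch with an invented per-cpu counter:

#include <linux/percpu.h>
#include <linux/preempt.h>

struct demo_stats {			/* invented example type */
	unsigned long events;
};
static DEFINE_PER_CPU(struct demo_stats, demo_stats);

static void demo_count_event(void)
{
	struct demo_stats *st;

	preempt_disable();		/* stay on one CPU while using the pointer */
	st = this_cpu_ptr(&demo_stats);	/* old spelling: &__get_cpu_var(demo_stats) */
	st->events++;
	preempt_enable();
}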
@@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; @@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; @@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) struct hrtimer_cpu_base *cpu_base; int base = hrtimer_clockid_to_base(which_clock); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; @@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) */ void hrtimer_interrupt(struct clock_event_device *dev) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void) if (!hrtimer_hres_active()) return; - td = &__get_cpu_var(tick_cpu_device); + td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } @@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu) local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); + new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..a16b67859e2a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. 
- */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) @@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime; break; } return 0; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c index e622ba365a13..e622ba365a13 100644 --- a/kernel/time/udelay_test.c +++ b/kernel/time/test_udelay.c diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..066f0ec05e48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..7efeedf53ebd 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, void tick_install_replacement(struct clock_event_device *newdev) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); @@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup) void tick_suspend(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } void tick_resume(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int broadcast = tick_resume_broadcast(); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); @@ -400,4 +400,5 @@ void tick_resume(void) void __init tick_init(void) { tick_broadcast_init(); + tick_nohz_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + /* * Broadcasting support */ diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..7ce740e78e1b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -59,7 +59,7 @@ void 
tick_setup_oneshot(struct clock_event_device *newdev, */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..1363d58f07e9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); */ void __tick_nohz_full_check(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_nohz_full_cpu(smp_processor_id())) { if (ts->tick_stopped && !is_idle_task(current)) { @@ -235,7 +235,7 @@ void tick_nohz_full_kick(void) if (!tick_nohz_full_cpu(smp_processor_id())) return; - irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* @@ -295,22 +295,12 @@ out: /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { - int cpu; - alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } - - cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); - } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -349,18 +339,11 @@ static int tick_nohz_init_all(void) #ifdef CONFIG_NO_HZ_FULL_ALL if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; cpumask_setall(tick_nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); - cpumask_clear(housekeeping_mask); - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; @@ -375,6 +358,37 @@ void __init tick_nohz_init(void) return; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. 
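tick_nohz_full_setup() above is the boot-time handler for the nohz_full= option; parsing a CPU list that early follows a common pattern, sketched here for a made-up option (error handling simplified, and the bootmem-time allocation is an assumption carried over from the code above):

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/printk.h>

static cpumask_var_t demo_mask;		/* invented example mask */

static int __init demo_cpus_setup(char *str)
{
	alloc_bootmem_cpumask_var(&demo_mask);
	if (cpulist_parse(str, demo_mask) < 0) {
		pr_warn("demo: bad cpulist '%s'\n", str);
		free_bootmem_cpumask_var(demo_mask);
	}
	return 1;	/* option handled either way */
}
__setup("demo_cpus=", demo_cpus_setup);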
But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " + "support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + cpumask_copy(housekeeping_mask, cpu_possible_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -559,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, ret = { .tv64 = 0 }; unsigned long rcu_delta_jiffies; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 time_delta; time_delta = timekeeping_max_deferment(); @@ -571,8 +585,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, last_jiffies = jiffies; } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || - arch_needs_cpu(cpu) || irq_work_needs_cpu()) { + if (rcu_needs_cpu(&rcu_delta_jiffies) || + arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -827,13 +841,12 @@ void tick_nohz_idle_enter(void) local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); + ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; __tick_nohz_idle_enter(ts); local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); /** * tick_nohz_irq_exit - update next tick event from interrupt exit @@ -845,7 +858,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); */ void tick_nohz_irq_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) __tick_nohz_idle_enter(ts); @@ -860,7 +873,7 @@ void tick_nohz_irq_exit(void) */ ktime_t tick_nohz_get_sleep_length(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->sleep_length; } @@ -938,7 +951,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -960,7 +973,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { @@ -973,7 +985,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) */ static void tick_nohz_handler(struct clock_event_device *dev) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); @@ -982,6 +994,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(now); tick_sched_handle(ts, regs); + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -993,7 +1009,7 @@ 
static void tick_nohz_handler(struct clock_event_device *dev) */ static void tick_nohz_switch_to_nohz(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) @@ -1055,7 +1071,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) @@ -1109,6 +1125,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (regs) tick_sched_handle(ts, regs); + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; @@ -1129,7 +1149,7 @@ early_param("skew_tick", skew_tick); */ void tick_setup_sched_timer(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* @@ -1198,7 +1218,7 @@ void tick_clock_notify(void) */ void tick_oneshot_notify(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } @@ -1213,7 +1233,7 @@ void tick_oneshot_notify(void) */ int tick_check_oneshot_change(int allow_nohz) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..6390517e77d4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. +/* + * mktime64 - Converts date to seconds. + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * @@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! 
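The tick_sched_timer() change above leans on the standard hrtimer callback contract: either advance the expiry with hrtimer_forward() and return HRTIMER_RESTART, or return HRTIMER_NORESTART and leave the timer idle until someone re-arms it. A generic, invented callback for contrast:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static bool demo_done;			/* invented stop condition */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	ktime_t now = ktime_get();

	/* ... periodic work ... */

	if (demo_done)
		return HRTIMER_NORESTART;	/* idle until re-armed */

	hrtimer_forward(t, now, ms_to_ktime(10));	/* push expiry past 'now' */
	return HRTIMER_RESTART;
}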
(However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; @@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0, year -= 1; } - return ((((unsigned long) + return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } - -EXPORT_SYMBOL(mktime); +EXPORT_SYMBOL(mktime64); /** * set_normalized_timespec - set timespec sec and nsec parts and normalize @@ -745,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n) return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } +EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ec1791fae965..6a931852082f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_ktime_data(struct timekeeper *tk) { - s64 nsec; + u64 seconds; + u32 nsec; /* * The xtime based monotonic readout is: @@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ - nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); - nsec *= NSEC_PER_SEC; - nsec += tk->wall_to_monotonic.tv_nsec; - tk->tkr.base_mono = ns_to_ktime(nsec); + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ tk->base_raw = timespec64_to_ktime(tk->raw_time); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. + */ + nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; } /* must hold timekeeper_lock */ @@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64); /** * getnstimeofday64 - Returns the time of day in a timespec64. - * @ts: pointer to the timespec to be set + * @ts: pointer to the timespec64 to be set * - * Returns the time of day in a timespec (WARN if suspended). + * Returns the time of day in a timespec64 (WARN if suspended). */ void getnstimeofday64(struct timespec64 *ts) { @@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw); * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. + * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { @@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts64); +/** + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC + * + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. 
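mktime64() returns time64_t, so the conversion stays exact past the 32-bit limits the old comment warned about (2038 for signed time_t, 2106 for unsigned long), and existing mktime() callers can switch over directly. A hedged sketch, assuming the 3.19-era declaration in <linux/time.h>:

#include <linux/time.h>

/* Invented example: an RTC-style driver converting a broken-down UTC date. */
static time64_t demo_rtc_to_epoch(void)
{
	/* 2040-01-02 03:04:05 UTC: beyond 2038, still represented exactly */
	return mktime64(2040, 1, 2, 3, 4, 5);
}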
tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. + */ +time64_t ktime_get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + WARN_ON(timekeeping_suspended); + return tk->ktime_sec; +} +EXPORT_SYMBOL_GPL(ktime_get_seconds); + +/** + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME + * + * Returns the wall clock seconds since 1970. This replaces the + * get_seconds() interface which is not y2038 safe on 32bit systems. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. + */ +time64_t ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return seconds; +} +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); + #ifdef CONFIG_NTP_PPS /** @@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv) EXPORT_SYMBOL(do_gettimeofday); /** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time + * do_settimeofday64 - Sets the time of day. + * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(const struct timespec *tv) +int do_settimeofday64(const struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts_delta, xt, tmp; + struct timespec64 ts_delta, xt; unsigned long flags; - if (!timespec_valid_strict(tv)) + if (!timespec64_valid_strict(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv) timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; + ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tmp = timespec_to_timespec64(*tv); - tk_set_xtime(tk, &tmp); + tk_set_xtime(tk, ts); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv) return 0; } -EXPORT_SYMBOL(do_settimeofday); +EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. 
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock) } /** - * getrawmonotonic - Returns the raw monotonic time in a timespec - * @ts: pointer to the timespec to be set + * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic(struct timespec *ts) +void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; @@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts) } while (read_seqcount_retry(&tk_core.seq, seq)); timespec64_add_ns(&ts64, nsecs); - *ts = timespec64_to_timespec(ts64); + *ts = ts64; } -EXPORT_SYMBOL(getrawmonotonic); +EXPORT_SYMBOL(getrawmonotonic64); + /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } /** - * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values - * @delta: pointer to a timespec delta value + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock * because their RTC/persistent clock is only accessible when irqs are enabled. @@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ -void timekeeping_inject_sleeptime(struct timespec *delta) +void timekeeping_inject_sleeptime64(struct timespec64 *delta) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tmp; unsigned long flags; /* @@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_forward_now(tk); - tmp = timespec_to_timespec64(*delta); - __timekeeping_inject_sleeptime(tk, &tmp); + __timekeeping_inject_sleeptime(tk, delta); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * * XXX - TODO: Doc ntp_error calculation. 
*/ + if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + tk->tkr.mult += mult_adj; tk->xtime_interval += interval; tk->tkr.xtime_nsec -= offset; @@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } if (unlikely(tk->tkr.clock->maxadj && - (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { + (abs(tk->tkr.mult - tk->tkr.clock->mult) + > tk->tkr.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr.clock->name, (long)tk->tkr.mult, @@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void) } EXPORT_SYMBOL(current_kernel_time); -struct timespec get_monotonic_coarse(void) +struct timespec64 get_monotonic_coarse64(void) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; @@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void) set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - return timespec64_to_timespec(now); + return now; } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..2d3f5c504939 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + struct tvec_base *base = raw_cpu_read(tvec_bases); timer->entry.next = NULL; timer->base = (void *)((unsigned long)base | flags); @@ -1377,15 +1377,14 @@ unsigned long get_next_timer_interrupt(unsigned long now) void update_process_times(int user_tick) { struct task_struct *p = current; - int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) - irq_work_run(); + irq_work_tick(); #endif scheduler_tick(); run_posix_cpu_timers(p); diff --git a/kernel/torture.c b/kernel/torture.c index d600af21f022..dd70993c266c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. */ -char *torture_onoff_stats(char *page) +void torture_onoff_stats(void) { #ifdef CONFIG_HOTPLUG_CPU - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return page; } EXPORT_SYMBOL_GPL(torture_onoff_stats); @@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); * * This must be called before the caller starts shutting down its own * kthreads. + * + * Both torture_cleanup_begin() and torture_cleanup_end() must be paired, + * in order to correctly perform the cleanup. They are separated because + * threads can still need to reference the torture_type type, thus nullify + * only after completing all other relevant calls. 
*/ -bool torture_cleanup(void) +bool torture_cleanup_begin(void) { mutex_lock(&fullstop_mutex); if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { @@ -651,12 +654,17 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup_begin); + +void torture_cleanup_end(void) +{ mutex_lock(&fullstop_mutex); torture_type = NULL; mutex_unlock(&fullstop_mutex); - return false; } -EXPORT_SYMBOL_GPL(torture_cleanup); +EXPORT_SYMBOL_GPL(torture_cleanup_end); /* * Is it time for the current torture test to stop? diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..979ccde26720 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o -ifeq ($(CONFIG_PM_RUNTIME),y) +ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o endif ifeq ($(CONFIG_TRACING),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ada2a04..483cecfa5c17 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); -static int blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, iter->ent->pid, act, rwbs); + trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { const unsigned char *pdu_buf; int pdu_len; - int i, end, ret; + int i, end; pdu_buf = pdu_start(ent); pdu_len = te_blk_io_trace(ent)->pdu_len; if (!pdu_len) - return 1; + return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) @@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) break; end++; - if (!trace_seq_putc(s, '(')) - return 0; + trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { - ret = trace_seq_printf(s, "%s%02x", - i == 0 ? "" : " ", pdu_buf[i]); - if (!ret) - return ret; + trace_seq_printf(s, "%s%02x", + i == 0 ? 
"" : " ", pdu_buf[i]); /* * stop when the rest is just zeroes and indicate so * with a ".." appended */ - if (i == end && end != pdu_len - 1) - return trace_seq_puts(s, " ..) "); + if (i == end && end != pdu_len - 1) { + trace_seq_puts(s, " ..) "); + return; + } } - return trace_seq_puts(s, ") "); + trace_seq_puts(s, ") "); } -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = trace_seq_printf(s, "%u ", t_bytes(ent)); - if (!ret) - return 0; - ret = blk_log_dump_pdu(s, ent); - if (!ret) - return 0; - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "%u ", t_bytes(ent)); + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", + trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + else + trace_seq_printf(s, "[%s]\n", cmd); } } -static int blk_log_with_error(struct trace_seq *s, +static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = blk_log_dump_pdu(s, ent); - if (ret) - return trace_seq_printf(s, "[%d]\n", t_error(ent)); - return 0; + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", - t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", - t_sector(ent), t_error(ent)); + trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + else + trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); } } -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); + trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), cmd); } -static int blk_log_msg(struct trace_seq 
*s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) { - int ret; const struct blk_io_trace *t = te_blk_io_trace(ent); - ret = trace_seq_putmem(s, t + 1, t->pdu_len); - if (ret) - return trace_seq_putc(s, '\n'); - return ret; + trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putc(s, '\n'); } /* @@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; - int ret; bool long_act; blk_log_action_t *log_action; @@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { - ret = log_action(iter, long_act ? "message" : "m"); - if (ret) - ret = blk_log_msg(s, iter->ent); - goto out; + log_action(iter, long_act ? "message" : "m"); + blk_log_msg(s, iter->ent); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Unknown action %x\n", what); + trace_seq_printf(s, "Unknown action %x\n", what); else { - ret = log_action(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act]); + what2act[what].print(s, iter->ent); } -out: - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, @@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, return print_one_line(iter, false); } -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; @@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) .time = iter->ts, }; - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); + trace_seq_putmem(s, &old, offset); + trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { - return blk_trace_synthesize_old_trace(iter) ? 
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + blk_trace_synthesize_old_trace(iter); + + return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) @@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5916a8e59e87..929a733d302e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); + #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); @@ -251,18 +254,24 @@ static void update_ftrace_function(void) ftrace_func_t func; /* + * Prepare the ftrace_ops that the arch callback will use. + * If there's only one ftrace_ops registered, the ftrace_ops_list + * will point to the ops we want. + */ + set_function_trace_op = ftrace_ops_list; + + /* If there's no ftrace_ops registered, just call the stub function */ + if (ftrace_ops_list == &ftrace_list_end) { + func = ftrace_stub; + + /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && - !FTRACE_FORCE_LIST_FUNC)) { - /* Set the ftrace_ops that the arch callback uses */ - set_function_trace_op = ftrace_ops_list; - func = ftrace_ops_list->func; + } else if (ftrace_ops_list->next == &ftrace_list_end) { + func = ftrace_ops_get_func(ftrace_ops_list); + } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; @@ -378,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, return ret; } +static void ftrace_update_trampoline(struct ftrace_ops *ops); + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_DELETED) @@ -407,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); + /* The control_ops needs the trampoline update */ + ops = &control_ops; } else add_ftrace_ops(&ftrace_ops_list, ops); + ftrace_update_trampoline(ops); + if (ftrace_enabled) update_ftrace_function(); @@ -556,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function " - "Hit Time Avg s^2\n" - " -------- " - "--- ---- --- ---\n"); + seq_puts(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); #else - seq_printf(m, " Function Hit\n" - " -------- ---\n"); + seq_puts(m, " Function Hit\n" + " -------- ---\n"); #endif return 0; } @@ -589,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-30.30s %10lu", str, rec->counter); 
#ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " "); + seq_puts(m, " "); avg = rec->time; do_div(avg, rec->counter); @@ -1048,6 +1063,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct ftrace_ops *removed_ops; +/* + * Set when doing a global update, like enabling all recs or disabling them. + * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -1096,6 +1117,43 @@ static struct ftrace_ops global_ops = { FTRACE_OPS_FL_INITIALIZED, }; +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + struct ftrace_ops *op; + bool ret = false; + + /* + * Some of the ops may be dynamically allocated, + * they are freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This is to check for dynamically allocated trampolines. + * Trampolines that are in kernel text will have + * core_kernel_text() return true. + */ + if (op->trampoline && op->trampoline_size) + if (addr >= op->trampoline && + addr < op->trampoline + op->trampoline_size) { + ret = true; + goto out; + } + } while_for_each_ftrace_op(op); + + out: + preempt_enable_notrace(); + + return ret; +} + struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; @@ -1300,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash); + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) @@ -1307,12 +1368,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_func_entry *entry; struct hlist_node *tn; struct hlist_head *hhd; - struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; int size = src->count; int bits = 0; + int ret; int i; + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + /* * If the new source is empty, just free dst and assign it * the empty_hash. @@ -1346,21 +1411,44 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } update: + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ + if (enable) { + /* IPMODIFY should be updated only when filter_hash updating */ + ret = ftrace_hash_ipmodify_update(ops, new_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return ret; + } + } + /* * Remove the current set, update the hash and add * them back. */ ftrace_hash_rec_disable_modify(ops, enable); - old_hash = *dst; rcu_assign_pointer(*dst, new_hash); - free_ftrace_hash_rcu(old_hash); ftrace_hash_rec_enable_modify(ops, enable); return 0; } +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an emty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. 
+ */ + return (ftrace_hash_empty(hash->filter_hash) || + ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !ftrace_lookup_ip(hash->notrace_hash, ip)); +} + /* * Test the hashes for this ops to see if we want to call * the ops->func or not. @@ -1376,8 +1464,7 @@ update: static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_ops_hash hash; int ret; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS @@ -1390,13 +1477,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); - if ((ftrace_hash_empty(filter_hash) || - ftrace_lookup_ip(filter_hash, ip)) && - (ftrace_hash_empty(notrace_hash) || - !ftrace_lookup_ip(notrace_hash, ip))) + if (hash_contains_ip(ip, &hash)) ret = 1; else ret = 0; @@ -1508,46 +1592,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void ftrace_remove_tramp(struct ftrace_ops *ops, - struct dyn_ftrace *rec) -{ - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - rec->flags &= ~FTRACE_FL_TRAMP; - - if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || - ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return; - /* - * The tramp_hash entry will be removed at time - * of update. - */ - ops->nr_trampolines--; -} - -static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) -{ - struct ftrace_ops *op; - - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - /* - * This function is called to clear other tramps - * not the one that is being updated. - */ - if (op == ops) - continue; - if (op->nr_trampolines) - ftrace_remove_tramp(op, rec); - } while_for_each_ftrace_op(op); -} - static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1636,18 +1680,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * function, and the ops has a trampoline registered * for it, then we can call it directly. */ - if (ftrace_rec_count(rec) == 1 && ops->trampoline) { + if (ftrace_rec_count(rec) == 1 && ops->trampoline) rec->flags |= FTRACE_FL_TRAMP; - ops->nr_trampolines++; - } else { + else /* * If we are adding another function callback * to this function, and the previous had a * custom trampoline in use, then we need to go * back to the default trampoline. */ - ftrace_clear_tramps(rec, ops); - } + rec->flags &= ~FTRACE_FL_TRAMP; /* * If any ops wants regs saved for this function @@ -1660,9 +1702,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, return; rec->flags--; - if (ops->trampoline && !ftrace_rec_count(rec)) - ftrace_remove_tramp(ops, rec); - /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -1677,6 +1716,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, } /* + * If the rec had TRAMP enabled, then it needs to + * be cleared. 
As TRAMP can only be enabled iff + * there is only a single ops attached to it. + * In otherwords, always disable it on decrementing. + * In the future, we may set it if rec count is + * decremented to one, and the ops that is left + * has a trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* * flags will be cleared in ftrace_check_record() * if rec count is zero. */ @@ -1735,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } +/* + * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK + * or no-needed to update, -EBUSY if it detects a conflict of the flag + * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. + * Note that old_hash and new_hash has below meanings + * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) + * - If the hash is EMPTY_HASH, it hits nothing + * - Anything else hits the recs which match the hash entries. + */ +static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, + struct ftrace_hash *old_hash, + struct ftrace_hash *new_hash) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec, *end = NULL; + int in_old, in_new; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + /* + * Since the IPMODIFY is a very address sensitive action, we do not + * allow ftrace_ops to set all functions to new hash. + */ + if (!new_hash || !old_hash) + return -EINVAL; + + /* Update rec->flags */ + do_for_each_ftrace_rec(pg, rec) { + /* We need to update only differences of filter_hash */ + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) { + /* New entries must ensure no others are using it */ + if (rec->flags & FTRACE_FL_IPMODIFY) + goto rollback; + rec->flags |= FTRACE_FL_IPMODIFY; + } else /* Removed entry */ + rec->flags &= ~FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + + return 0; + +rollback: + end = rec; + + /* Roll back what we did above */ + do_for_each_ftrace_rec(pg, rec) { + if (rec == end) + goto err_out; + + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) + rec->flags &= ~FTRACE_FL_IPMODIFY; + else + rec->flags |= FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + +err_out: + return -EBUSY; +} + +static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); +} + +/* Disabling always succeeds */ +static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); +} + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash) +{ + struct ftrace_hash *old_hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(old_hash)) + old_hash = NULL; + + if (ftrace_hash_empty(new_hash)) + new_hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1745,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char 
*p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); + /** * ftrace_bug - report and shutdown function tracer * @failed: The failed type (EFAULT, EINVAL, EPERM) - * @ip: The address that failed + * @rec: The record that failed * * The arch code that enables or disables the function tracing * can call ftrace_bug() when it has detected a problem in @@ -1757,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p) * EINVAL - if what is read at @ip is not what was expected * EPERM - if the problem happens on writting to the @ip address */ -void ftrace_bug(int failed, unsigned long ip) +void ftrace_bug(int failed, struct dyn_ftrace *rec) { + unsigned long ip = rec ? rec->ip : 0; + switch (failed) { case -EFAULT: FTRACE_WARN_ON_ONCE(1); @@ -1770,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1782,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + if (rec) { + struct ftrace_ops *ops = NULL; + + pr_info("ftrace record flags: %lx\n", rec->flags); + pr_cont(" (%ld)%s", ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + pr_cont("\ttramp: %pS", + (void *)ops->trampoline); + else + pr_cont("\ttramp: ERROR!"); + + } + ip = ftrace_get_addr_curr(rec); + pr_cont(" expected tramp: %lx\n", ip); + } } static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) @@ -1895,21 +2076,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) } static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; - /* Removed ops need to be tested first */ - if (removed_ops && removed_ops->tramp_hash) { - if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) return removed_ops; } + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. 
+ */ do_for_each_ftrace_op(op, ftrace_ops_list) { - if (!op->tramp_hash) + + if (!op->trampoline) continue; - if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. + */ + if (op->flags & FTRACE_OPS_FL_ADDING) + continue; + + + /* + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. + */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) + return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -1921,10 +2167,11 @@ static struct ftrace_ops * ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; do_for_each_ftrace_op(op, ftrace_ops_list) { /* pass rec in as regs to have non-NULL val */ - if (ftrace_ops_test(op, rec->ip, rec)) + if (hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -2038,7 +2285,7 @@ void __weak ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { failed = __ftrace_replace_code(rec, enable); if (failed) { - ftrace_bug(failed, rec->ip); + ftrace_bug(failed, rec); /* Stop processing */ return; } @@ -2120,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { - unsigned long ip; int ret; - ip = rec->ip; - if (unlikely(ftrace_disabled)) return 0; ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { - ftrace_bug(ret, ip); + ftrace_bug(ret, rec); return 0; } return 1; @@ -2231,92 +2475,6 @@ void __weak arch_ftrace_update_code(int command) ftrace_run_stop_machine(command); } -static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) -{ - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int size, bits; - int ret; - - size = ops->nr_trampolines; - bits = 0; - /* - * Make the hash size about 1/2 the # found - */ - for (size /= 2; size; size >>= 1) - bits++; - - ops->tramp_hash = alloc_ftrace_hash(bits); - /* - * TODO: a failed allocation is going to screw up - * the accounting of what needs to be modified - * and not. For now, we kill ftrace if we fail - * to allocate here. But there are ways around this, - * but that will take a little more work. - */ - if (!ops->tramp_hash) - return -ENOMEM; - - do_for_each_ftrace_rec(pg, rec) { - if (ftrace_rec_count(rec) == 1 && - ftrace_ops_test(ops, rec->ip, rec)) { - - /* - * If another ops adds to a rec, the rec will - * lose its trampoline and never get it back - * until all ops are off of it. - */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - continue; - - /* This record had better have a trampoline */ - if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) - return -1; - - ret = add_hash_entry(ops->tramp_hash, rec->ip); - if (ret < 0) - return ret; - } - } while_for_each_ftrace_rec(); - - /* The number of recs in the hash must match nr_trampolines */ - if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) - pr_warn("count=%ld trampolines=%d\n", - ops->tramp_hash->count, - ops->nr_trampolines); - - return 0; -} - -static int ftrace_save_tramp_hashes(void) -{ - struct ftrace_ops *op; - int ret; - - /* - * Now that any trampoline is being used, we need to save the - * hashes for the ops that have them. 
This allows the mapping - * back from the record to the ops that has the trampoline to - * know what code is being replaced. Modifying code must always - * verify what it is changing. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - - /* The tramp_hash is recreated each time. */ - free_ftrace_hash(op->tramp_hash); - op->tramp_hash = NULL; - - if (op->nr_trampolines) { - ret = ftrace_save_ops_tramp_hash(op); - if (ret) - return ret; - } - - } while_for_each_ftrace_op(op); - - return 0; -} - static void ftrace_run_update_code(int command) { int ret; @@ -2336,14 +2494,25 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); +} - ret = ftrace_save_tramp_hashes(); - FTRACE_WARN_ON(ret); +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; +void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ +} + static void control_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); @@ -2362,6 +2531,13 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + static int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2376,12 +2552,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - ops->flags |= FTRACE_OPS_FL_ENABLED; + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. + */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; + + ret = ftrace_hash_ipmodify_enable(ops); + if (ret < 0) { + /* Rollback registration process */ + __unregister_ftrace_function(ops); + ftrace_start_up--; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + return ret; + } ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); + ops->flags &= ~FTRACE_OPS_FL_ADDING; + return 0; } @@ -2404,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); + /* Disabling ipmodify never fails */ + ftrace_hash_ipmodify_disable(ops); ftrace_hash_rec_disable(ops, 1); ops->flags &= ~FTRACE_OPS_FL_ENABLED; @@ -2431,11 +2628,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If the ops uses a trampoline, then it needs to be * tested first on update. */ + ops->flags |= FTRACE_OPS_FL_REMOVING; removed_ops = ops; + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + ftrace_run_update_code(command); + /* + * If there's no more ops registered with ftrace, run a + * sanity check to make sure all rec flags are cleared. 
+ */ + if (ftrace_ops_list == &ftrace_list_end) { + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (FTRACE_WARN_ON_ONCE(rec->flags)) + pr_warn(" %pS flags:%lx\n", + (void *)rec->ip, rec->flags); + } while_for_each_ftrace_rec(); + } + + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; /* * Dynamic ops may be freed, we must make sure that all @@ -2454,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { schedule_on_each_cpu(ftrace_sync); + arch_ftrace_trampoline_free(ops); + if (ops->flags & FTRACE_OPS_FL_CONTROL) control_ops_free(ops); } @@ -2606,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) - ftrace_bug(failed, p->ip); + ftrace_bug(failed, p); } } } @@ -2931,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&ftrace_lock); } +void * __weak +arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return NULL; +} + +static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + void *ptr; + + ptr = arch_ftrace_trampoline_func(ops, rec); + if (ptr) + seq_printf(m, " ->%pS", ptr); +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -2941,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) - seq_printf(m, "#### no functions disabled ####\n"); + seq_puts(m, "#### no functions disabled ####\n"); else - seq_printf(m, "#### all functions enabled ####\n"); + seq_puts(m, "#### all functions enabled ####\n"); return 0; } @@ -2954,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - seq_printf(m, " (%ld)%s", + struct ftrace_ops *ops = NULL; + + seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), - rec->flags & FTRACE_FL_REGS ? " R" : " "); + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_IPMODIFY ? 
" I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { - struct ftrace_ops *ops; - - ops = ftrace_find_tramp_ops_curr(rec); - if (ops && ops->trampoline) + ops = ftrace_find_tramp_ops_any(rec); + if (ops) seq_printf(m, "\ttramp: %pS", (void *)ops->trampoline); else - seq_printf(m, "\ttramp: ERROR!"); + seq_puts(m, "\ttramp: ERROR!"); + } + add_trampoline_func(m, ops, rec); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -3003,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - if (unlikely(ftrace_disabled)) - return -ENODEV; - iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (iter) { iter->pg = ftrace_pages_start; @@ -3340,7 +3579,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3587,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3399,6 +3639,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_func_probe *entry; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3417,7 +3658,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) { count = -ENOMEM; goto out; @@ -3476,10 +3717,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); - if (ret < 0) - count = ret; - __enable_ftrace_function_probe(); + __enable_ftrace_function_probe(old_hash); + + if (!ret) + free_ftrace_hash_rcu(old_hash); + else + count = ret; out_unlock: mutex_unlock(&ftrace_lock); @@ -3503,6 +3747,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_probe *entry; struct ftrace_func_probe *p; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3510,6 +3755,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int type = MATCH_FULL; int i, len = 0; char *search; + int ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; @@ -3568,8 +3814,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. 
*/ - ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); synchronize_sched(); + if (!ret) + free_ftrace_hash_rcu(old_hash); + list_for_each_entry_safe(entry, p, &free_list, free_list) { list_del(&entry->free_list); ftrace_free_entry(entry); @@ -3756,10 +4005,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3767,6 +4017,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -3801,10 +4052,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); + old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) - ftrace_ops_update_code(ops); - + if (!ret) { + ftrace_ops_update_code(ops, old_hash); + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -3944,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static unsigned long save_global_trampoline; +static unsigned long save_global_flags; + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -4013,6 +4269,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4042,11 +4299,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); + old_hash = *orig_hash; ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret) - ftrace_ops_update_code(iter->ops); - + if (!ret) { + ftrace_ops_update_code(iter->ops, old_hash); + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); } @@ -4149,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v) struct ftrace_graph_data *fgd = m->private; if (fgd->table == ftrace_graph_funcs) - seq_printf(m, "#### all functions enabled ####\n"); + seq_puts(m, "#### all functions enabled ####\n"); else - seq_printf(m, "#### no functions disabled ####\n"); + seq_puts(m, "#### no functions disabled ####\n"); return 0; } @@ -4662,6 +4921,32 @@ void __init ftrace_init(void) ftrace_disabled = 1; } +/* Do nothing if arch does not support this */ +void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ + +/* + * Currently there's no safe way to free a trampoline when the kernel + * is configured with PREEMPT. That is because a task could be preempted + * when it jumped to the trampoline, it may be preempted for a long time + * depending on the system load, and currently there's no way to know + * when it will be off the trampoline. 
If the trampoline is freed + * too early, when the task runs again, it will be executing on freed + * memory and crash. + */ +#ifdef CONFIG_PREEMPT + /* Currently, only non dynamic ops can have a trampoline */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + return; +#endif + + arch_ftrace_update_trampoline(ops); +} + #else static struct ftrace_ops global_ops = { @@ -4678,6 +4963,7 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } +static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ @@ -4703,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 1; } +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + #endif /* CONFIG_DYNAMIC_FTRACE */ __init void ftrace_init_global_array_ops(struct trace_array *tr) @@ -4827,6 +5117,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) } #endif +/* + * If there's only one function registered but it does not support + * recursion, this function will be called by the mcount trampoline. + * This function will handle recursion protection. + */ +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + int bit; + + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; + + op->func(ip, parent_ip, op, regs); + + trace_clear_recursion(bit); +} + +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_recurs_func() instead. + * + * Returns the function that the trampoline should call for @ops. + */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. 
+ */ + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) + return ftrace_ops_recurs_func; + + return ops->func; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -4927,7 +5267,8 @@ static int ftrace_pid_add(int p) set_ftrace_pid_task(pid); ftrace_update_pid_func(); - ftrace_startup_enable(0); + + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); return 0; @@ -4956,7 +5297,7 @@ static void ftrace_pid_reset(void) } ftrace_update_pid_func(); - ftrace_startup_enable(0); + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); } @@ -4989,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v) const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); if (v == (void *)1) { - seq_printf(m, "no pid\n"); + seq_puts(m, "no pid\n"); return 0; } if (fpid->pid == ftrace_swapper_pid) - seq_printf(m, "swapper tasks\n"); + seq_puts(m, "swapper tasks\n"); else seq_printf(m, "%u\n", pid_vnr(fpid->pid)); @@ -5207,6 +5548,7 @@ static struct ftrace_ops graph_ops = { FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ #endif ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; @@ -5436,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, update_function_graph_func(); ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); - out: mutex_unlock(&ftrace_lock); return ret; @@ -5457,6 +5798,17 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); +#ifdef CONFIG_DYNAMIC_FTRACE + /* + * Function graph does not allocate the trampoline, but + * other global_ops do. We need to reset the ALLOC_TRAMP flag + * if one was used. 
+ */ + global_ops.trampoline = save_global_trampoline; + if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) + global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; +#endif + out: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..7a4104cb95cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work); */ int ring_buffer_print_entry_header(struct trace_seq *s) { - int ret; - - ret = trace_seq_puts(s, "# compressed entry header\n"); - ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_puts(s, "\tarray : 32 bits\n"); - ret = trace_seq_putc(s, '\n'); - ret = trace_seq_printf(s, "\tpadding : type == %d\n", - RINGBUF_TYPE_PADDING); - ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", - RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata max type_len == %d\n", - RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + trace_seq_puts(s, "# compressed entry header\n"); + trace_seq_puts(s, "\ttype_len : 5 bits\n"); + trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + trace_seq_puts(s, "\tarray : 32 bits\n"); + trace_seq_putc(s, '\n'); + trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); - return ret; + return !trace_seq_has_overflowed(s); } /* @@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta) int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; - int ret; - ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); - return ret; + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); } struct rb_irq_work { @@ -538,16 +535,18 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: 
the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any @@ -564,36 +563,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. 
+ */ + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } finish_wait(&work->waiters, &wait); - return 0; + + return ret; } /** diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) break; schedule(); - __set_current_state(TASK_RUNNING); } reader_finish = 0; complete(&read_done); @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) break; schedule(); - __set_current_state(TASK_RUNNING); } __set_current_state(TASK_RUNNING); @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); } if (kill_test) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..2e767972e99c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; +/* Pipe tracepoints to printk */ +struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { - __disable_trace_on_warning = 1; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; return 1; } -__setup("traceoff_on_warning=", stop_trace_on_warning); +__setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { @@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str) } __setup("trace_clock=", set_trace_boot_clock); +static int __init set_tracepoint_printk(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); unsigned long long ns2usecs(cycle_t nsec) { @@ -938,19 +950,20 @@ out: return ret; } +/* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (s->len <= s->readpos) + if (trace_seq_used(s) <= s->seq.readpos) return -EBUSY; - len = s->len - s->readpos; + len = trace_seq_used(s) - s->seq.readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->readpos, cnt); + memcpy(buf, s->buffer + s->seq.readpos, cnt); - s->readpos += cnt; + s->seq.readpos += cnt; return cnt; } @@ -1076,13 +1089,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int 
wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2028,7 +2042,7 @@ void trace_printk_init_buffers(void) pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); pr_warning("** **\n"); pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** unsafe for production use. **\n"); pr_warning("** **\n"); pr_warning("** If you see this message and you are not debugging **\n"); pr_warning("** the kernel, report this immediately to your vendor! **\n"); @@ -2157,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, goto out; } - len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - if (len > TRACE_BUF_SIZE) - goto out; + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); local_save_flags(flags); size = sizeof(*entry) + len + 1; @@ -2170,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, tbuffer, len); - entry->buf[len] = '\0'; + memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); @@ -2508,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf, static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) @@ -2532,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" + "# | | | | |\n"); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-----=> irqs-off\n" + "# / _----=> need-resched\n" + "# | / _---=> hardirq/softirq\n" + "# || / _--=> preempt-depth\n" + "# ||| / delay\n" + "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" + "# | | | |||| | |\n"); } void @@ -2648,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - 
if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - if (!trace_print_lat_context(iter)) - goto partial; - } else { - if (!trace_print_context(iter)) - goto partial; - } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + trace_print_context(iter); } + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + if (event) return event->funcs->trace(iter, sym_flags, event); - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; + trace_seq_printf(s, "Unknown type %d\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) @@ -2676,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts)) - goto partial; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); - if (!trace_seq_printf(s, "%d ?\n", entry->type)) - goto partial; + trace_seq_printf(s, "%d ?\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) @@ -2704,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2716,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return ret; } - SEQ_PUT_FIELD_RET(s, newline); + SEQ_PUT_FIELD(s, newline); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) @@ -2730,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2778,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events && - !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events)) - return TRACE_TYPE_PARTIAL_LINE; + if (iter->lost_events) { + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -2859,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; - seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - seq_printf(m, "# MAY BE 
MISSING FUNCTION EVENTS\n"); + seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" + "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); #else - seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); - seq_printf(m, "# Must use main snapshot file to allocate.\n"); + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); #endif - seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) - seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else - seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - seq_printf(m, "# Snapshot commands:\n"); + seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else @@ -3250,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v) if (!t) return 0; - seq_printf(m, "%s", t->name); + seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else @@ -4313,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) goto out; } + trace_seq_init(&iter->seq); + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. 
@@ -4434,15 +4448,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if (signal_pending(current)) - return -EINTR; } return 1; @@ -4509,18 +4520,18 @@ waitagain: trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; - int len = iter->seq.len; + int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ - iter->seq.len = len; + iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); - if (iter->seq.len >= cnt) + if (trace_seq_used(&iter->seq) >= cnt) break; /* @@ -4536,7 +4547,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) + if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* @@ -4570,20 +4581,33 @@ static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; + int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { - count = iter->seq.len; + save_len = iter->seq.seq.len; ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; break; } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. + */ if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; break; } @@ -4664,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - iter->seq.len); + trace_seq_used(&iter->seq)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = iter->seq.len; + spd.partial[i].len = trace_seq_used(&iter->seq); trace_seq_init(&iter->seq); } @@ -5372,16 +5396,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&trace_types_lock); if (ret) { size = ret; goto out_unlock; } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } goto again; } size = 0; @@ -5500,7 +5520,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - ssize_t ret; + ssize_t ret = 0; mutex_lock(&trace_types_lock); @@ -5538,13 +5558,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) + if (!ref) { + ret = -ENOMEM; break; + } ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { + ret = -ENOMEM; kfree(ref); break; } @@ -5582,19 +5605,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? 
*/ if (!spd.nr_pages) { + if (ret) + goto out; + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; goto out; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, true); mutex_lock(&trace_types_lock); if (ret) goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + goto again; } @@ -5671,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -5752,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%ps:", (void *)ip); - seq_printf(m, "snapshot"); + seq_puts(m, "snapshot"); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); @@ -6420,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m int ret; /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (WARN_ON_ONCE(parent != trace_instance_dir)) return -ENOENT; @@ -6447,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) int ret; /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (WARN_ON_ONCE(parent != trace_instance_dir)) return -ENOENT; @@ -6634,11 +6658,19 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= TRACE_MAX_PRINT) - s->len = TRACE_MAX_PRINT; + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. 
*/ - s->buffer[s->len] = 0; + s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); @@ -6877,6 +6909,19 @@ out: return ret; } +void __init trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (WARN_ON(!tracepoint_print_iter)) + tracepoint_printk = 0; + } + tracer_alloc_buffers(); + init_ftrace_syscalls(); + trace_event_init(); +} + __init static int clear_boot_tracer(void) { /* @@ -6896,6 +6941,5 @@ __init static int clear_boot_tracer(void) return 0; } -early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 385391fb1d3b..8de48bac1ce2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -14,6 +14,7 @@ #include <linux/trace_seq.h> #include <linux/ftrace_event.h> #include <linux/compiler.h> +#include <linux/trace_seq.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc); - -void tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *cur, - unsigned long flags, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); -void tracing_sched_switch_assign_trace(struct trace_array *tr); -void tracing_stop_sched_switch_record(void); -void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; +extern char trace_find_mark(unsigned long long duration); + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -737,7 +728,7 @@ extern unsigned long trace_flags; extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); -extern enum print_line_t +extern void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); @@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, #define perf_ftrace_event_register NULL #endif +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +#else +static inline void init_ftrace_syscalls(void) { } +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +#else +static inline void __init trace_event_init(void) { } +#endif + +extern struct trace_iterator *tracepoint_print_iter; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f0..7d6e2afde669 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", - field->correct ? 
" ok " : " MISS ", - field->func, - field->file, - field->line)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + + return trace_handle_return(&iter->seq); } static void branch_print_header(struct seq_file *s) { seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" - " FUNC:FILE:LINE\n"); - seq_puts(s, "# | | | | | " - " |\n"); + " FUNC:FILE:LINE\n" + "# | | | | | " + " |\n"); } static struct trace_event_functions trace_branch_funcs = { @@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[]; static int annotated_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " correct incorrect % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } @@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) - seq_printf(m, " X "); + seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); @@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[]; static int all_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " miss hit %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " miss hit % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ef06ce7e9cf8..366a78a3e61e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); + +static void output_printk(struct ftrace_event_buffer *fbuffer) +{ + struct ftrace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + if (!iter) + return; + + event_call = fbuffer->ftrace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->ftrace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) { + if (tracepoint_printk) + output_printk(fbuffer); + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); @@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) if (dir) { spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &dir->d_subdirs, d_child) { if (child->d_inode) /* probably unneeded */ child->d_inode->i_private = NULL; } @@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v) case FORMAT_HEADER: seq_printf(m, "name: %s\n", ftrace_event_name(call)); 
seq_printf(m, "ID: %d\n", call->event.type); - seq_printf(m, "format:\n"); + seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: @@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_unlock(&event_mutex); if (file) - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); print_subsystem_event_filter(system, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); func(s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, ftrace_event_name(data->file->event_call)); if (data->count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", data->count); @@ -2477,8 +2512,14 @@ static __init int event_trace_init(void) #endif return 0; } -early_initcall(event_trace_memsetup); -core_initcall(event_trace_enable); + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); +} + fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2513,8 +2554,11 @@ static __init int event_test_thread(void *unused) kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) + while (!kthread_should_stop()) { schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7a8c1528e141..ced69da0ff55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -45,6 +45,7 @@ enum filter_op_ids OP_GT, OP_GE, OP_BAND, + OP_NOT, OP_NONE, OP_OPEN_PAREN, }; @@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = { { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, { OP_BAND, "&", 6 }, + { OP_NOT, "!", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -85,6 +87,7 @@ enum { FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, FILT_ERR_IP_FIELD_ONLY, + FILT_ERR_ILLEGAL_NOT_OP, }; static char *err_text[] = { @@ -101,6 +104,7 @@ static char *err_text[] = { "Missing field name and/or value", "Meaningless filter expression", "Only 'ip' field is supported for function trace", + "Illegal use of '!'", }; struct opstack_op { @@ -139,6 +143,7 @@ struct pred_stack { int index; }; +/* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ break; \ } \ \ - return match; \ + return !!match == !pred->not; \ } #define DEFINE_EQUALITY_PRED(size) \ @@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds, if (!WARN_ON_ONCE(!pred->fn)) match = pred->fn(pred, rec); if (!!match == type) - return match; + break; } - return match; + /* If not of not match is equal to not of not, then it is a match */ + return !!match == !op->not; } struct 
filter_match_preds_data { @@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter, * then this op can be folded. */ if (left->index & FILTER_PRED_FOLD && - (left->op == dest->op || + ((left->op == dest->op && !left->not) || left->left == FILTER_PRED_INVALID) && right->index & FILTER_PRED_FOLD && - (right->op == dest->op || + ((right->op == dest->op && !right->not) || right->left == FILTER_PRED_INVALID)) dest->index |= FILTER_PRED_FOLD; @@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps, } if (pred->op == OP_NE) - pred->not = 1; + pred->not ^= 1; pred->fn = fn; return 0; @@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call, continue; } + if (elt->op == OP_NOT) { + if (!n_preds || operand1 || operand2) { + parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); + err = -EINVAL; + goto fail; + } + if (!dry_run) + filter->preds[n_preds - 1].not ^= 1; + continue; + } + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); err = -ENOSPC; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4747b476a030..8712df9decb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m, { long count = (long)data; - seq_printf(m, "%s", name); + seq_puts(m, name); if (count == -1) seq_puts(m, ":unlimited"); @@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m, if (filter_str) seq_printf(m, " if %s\n", filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, if (data->filter_str) seq_printf(m, " if %s\n", data->filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 57f0ec962d2c..fcd41a166405 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static int update_count(void **data) +static void update_traceon_count(void **data, bool on) { - unsigned long *count = (long *)data; + long *count = (long *)data; + long old_count = *count; - if (!*count) - return 0; + /* + * Tracing gets disabled (or enabled) once per count. + * This function can be called at the same time on multiple CPUs. + * It is fine if both disable (or enable) tracing, as disabling + * (or enabling) the second time doesn't do anything as the + * state of the tracer is already disabled (or enabled). + * What needs to be synchronized in this case is that the count + * only gets decremented once, even if the tracer is disabled + * (or enabled) twice, as the second one is really a nop. + * + * The memory barriers guarantee that we only decrement the + * counter once. First the count is read to a local variable + * and a read barrier is used to make sure that it is loaded + * before checking if the tracer is in the state we want. + * If the tracer is not in the state we want, then the count + * is guaranteed to be the old count. + * + * Next the tracer is set to the state we want (disabled or enabled) + * then a write memory barrier is used to make sure that + * the new state is visible before changing the counter by + * one minus the old counter. 
This guarantees that another CPU + * executing this code will see the new state before seeing + * the new counter value, and would not do anything if the new + * counter is seen. + * + * Note, there is no synchronization between this and a user + * setting the tracing_on file. But we currently don't care + * about that. + */ + if (!old_count) + return; - if (*count != -1) - (*count)--; + /* Make sure we see count before checking tracing state */ + smp_rmb(); - return 1; + if (on == !!tracing_is_on()) + return; + + if (on) + tracing_on(); + else + tracing_off(); + + /* unlimited? */ + if (old_count == -1) + return; + + /* Make sure tracing state is visible before updating count */ + smp_wmb(); + + *count = old_count - 1; } static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (tracing_is_on()) - return; - - if (update_count(data)) - tracing_on(); + update_traceon_count(data, 1); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; - - if (update_count(data)) - tracing_off(); + update_traceon_count(data, 0); } static void @@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; + long *count = (long *)data; + long old_count; + long new_count; - if (update_count(data)) - trace_dump_stack(STACK_SKIP); + /* + * Stack traces should only execute the number of times the + * user specified in the counter. + */ + do { + + if (!tracing_is_on()) + return; + + old_count = *count; + + if (!old_count) + return; + + /* unlimited? */ + if (old_count == -1) { + trace_dump_stack(STACK_SKIP); + return; + } + + new_count = old_count - 1; + new_count = cmpxchg(count, old_count, new_count); + if (new_count == old_count) + trace_dump_stack(STACK_SKIP); + + } while (new_count != old_count); +} + +static int update_count(void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return 0; + + if (*count != -1) + (*count)--; + + return 1; } static void @@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m, seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f0a0c982cde3..ba476009e5de 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -107,7 +107,7 @@ enum { FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags); @@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr) static int max_bytes_for_cpu; -static enum print_line_t -print_graph_cpu(struct trace_seq *s, int cpu) +static void print_graph_cpu(struct trace_seq *s, int cpu) { - int ret; - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); } #define TRACE_GRAPH_PROCINFO_LENGTH 14 -static enum print_line_t -print_graph_proc(struct trace_seq *s, pid_t pid) +static void print_graph_proc(struct 
trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; int spaces = 0; - int ret; int len; int i; @@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid) spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; /* First spaces to align center */ - for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < spaces / 2; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%s-%s", comm, pid_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s-%s", comm, pid_str); /* Last spaces to align center */ - for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = 0; i < spaces - (spaces / 2); i++) + trace_seq_putc(s, ' '); } -static enum print_line_t -print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - if (!trace_seq_putc(s, ' ')) - return 0; - - return trace_print_lat_fmt(s, entry); + trace_seq_putc(s, ' '); + trace_print_lat_fmt(s, entry); } /* If the pid changed since the last trace, output this event */ -static enum print_line_t +static void verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; - int ret; if (!data) - return TRACE_TYPE_HANDLED; + return; last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) - return TRACE_TYPE_HANDLED; + return; prev_pid = *last_pid; *last_pid = pid; if (prev_pid == -1) - return TRACE_TYPE_HANDLED; + return; /* * Context-switch trace line: @@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_puts(s, - " ------------------------------------------\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, prev_pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " => "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, - "\n ------------------------------------------\n\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " ------------------------------------------\n"); + print_graph_cpu(s, cpu); + print_graph_proc(s, prev_pid); + trace_seq_puts(s, " => "); + print_graph_proc(s, pid); + trace_seq_puts(s, "\n ------------------------------------------\n\n"); } static struct ftrace_graph_ret_entry * @@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -static int print_graph_abs_time(u64 t, struct trace_seq *s) +static void print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; usecs_rem = do_div(t, NSEC_PER_SEC); usecs_rem /= 1000; - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); + trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } -static enum print_line_t +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { - int ret; struct trace_seq *s = &iter->seq; + struct trace_entry 
*ent = iter->ent; if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) - return TRACE_TYPE_UNHANDLED; + return; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, pid); + trace_seq_puts(s, " | "); } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); } /* No overhead */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) - ret = trace_seq_puts(s, "==========>"); + trace_seq_puts(s, "==========>"); else - ret = trace_seq_puts(s, "<=========="); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_putc(s, '\n'); + trace_seq_puts(s, "<=========="); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + print_graph_duration(0, s, flags | FLAGS_FILL_END); + trace_seq_putc(s, '\n'); } -enum print_line_t +void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ - char msecs_str[21]; + char usecs_str[21]; char nsecs_str[5]; - int ret, len; + int len; int i; - sprintf(msecs_str, "%lu", (unsigned long) duration); + sprintf(usecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, "%s", msecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s", usecs_str); - len = strlen(msecs_str); + len = strlen(usecs_str); /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); snprintf(nsecs_str, slen, "%03lu", nsecs_rem); - ret = trace_seq_printf(s, ".%s", nsecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, ".%s", nsecs_str); len += strlen(nsecs_str); } - ret = trace_seq_puts(s, " us "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = len; i < 7; i++) + trace_seq_putc(s, ' '); } -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags) { - int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION) || !(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return TRACE_TYPE_HANDLED; + return; /* No real adata, just filling the column with spaces */ switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { case FLAGS_FILL_FULL: - ret = trace_seq_puts(s, " | "); - return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " | "); + return; case FLAGS_FILL_START: - ret = trace_seq_puts(s, " "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " "); + return; case FLAGS_FILL_END: - ret = trace_seq_puts(s, " |"); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " |"); + return; } /* Signal a overhead of time execution to the output */ - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - ret = trace_seq_puts(s, "! "); - /* Duration exceeded 10 msecs */ - else if (duration > 10000ULL) - ret = trace_seq_puts(s, "+ "); - } - - /* - * The -1 means we either did not exceed the duration tresholds - * or we dont want to print out the overhead. Either way we need - * to fill out the space. - */ - if (ret == -1) - ret = trace_seq_puts(s, " "); - - /* Catching here any failure happenned above */ - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_print_graph_duration(duration, s); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_puts(s, "| "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) + trace_seq_printf(s, "%c ", trace_find_mark(duration)); + else + trace_seq_puts(s, " "); - return TRACE_TYPE_HANDLED; + trace_print_graph_duration(duration, s); + trace_seq_puts(s, "| "); } /* Case of a leaf function on its call entry */ @@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - int ret; int i; graph_ret = &ret_entry->ret; @@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%ps();\n", (void *)call->func); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; - int ret; int i; if (data) { @@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%ps() {\n", (void *)call->func); - ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); - if (!ret) + if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; /* @@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_NO_CONSUME; } -static enum print_line_t +static void print_graph_prologue(struct trace_iterator *iter, struct trace_seq 
*s, int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; int cpu = iter->cpu; - int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + verif_pid(s, ent->pid, cpu, data); - if (type) { + if (type) /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return 0; + return; /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, ent->pid); + trace_seq_puts(s, " | "); } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - ret = print_graph_lat_fmt(s, ent); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); - return 0; + return; } /* @@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) @@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; - int ret; int i; if (check_irq_return(iter, flags, trace->depth)) @@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } } - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Closing brace */ - for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* * If the return function does not have a matching entry, @@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. Always do * that if the funcgraph-tail option is enabled. 
*/ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { - ret = trace_seq_puts(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } else { - ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); /* Overrun */ - if (flags & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)\n", - trace->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_OVERRUN) + trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, - cpu, pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 0) - for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* The comment */ - ret = trace_seq_puts(s, "/* "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, "/* "); switch (iter->ent->type) { case TRACE_BPRINT: @@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return ret; } + if (trace_seq_has_overflowed(s)) + goto out; + /* Strip ending newline */ - if (s->buffer[s->len - 1] == '\n') { - s->buffer[s->len - 1] = '\0'; - s->len--; + if (s->buffer[s->seq.len - 1] == '\n') { + s->buffer[s->seq.len - 1] = '\0'; + s->seq.len--; } - ret = trace_seq_puts(s, " */\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " */\n"); + out: + return trace_handle_return(s); } @@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) print_lat_header(s, flags); /* 1st line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " TIME "); + seq_puts(s, " TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " CPU"); + seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_puts(s, " TASK/PID "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " DURATION "); - seq_printf(s, " FUNCTION CALLS\n"); + seq_puts(s, " DURATION "); + seq_puts(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_puts(s, " | | "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, 
" | | "); - seq_printf(s, " | | | |\n"); + seq_puts(s, " | | "); + seq_puts(s, " | | | |\n"); } static void print_graph_headers(struct seq_file *s) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index bd90e1b06088..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); @@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } - if (!trace_empty(&iter)) - trace_find_next_entry_inc(&iter); - while (!trace_empty(&iter)) { + + while (trace_find_next_entry_inc(&iter)) { if (!cnt) kdb_printf("---------------------------------\n"); cnt++; - if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + if (!skip_lines) { print_trace_line(&iter); - if (!skip_lines) trace_printk_seq(&iter.seq); - else + } else { skip_lines--; + } + if (KDB_FLAG(CMD_INTERRUPT)) goto out; } @@ -86,9 +88,12 @@ out: atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - for_each_tracing_cpu(cpu) - if (iter.buffer_iter[cpu]) + for_each_tracing_cpu(cpu) { + if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); + iter.buffer_iter[cpu] = NULL; + } + } } /* @@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_REPEAT_NONE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f6e4e5539..5edb518be345 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_putc(m, trace_kprobe_is_return(tk) ? 
'r' : 'p'); seq_printf(m, ":%s/%s", tk->tp.call.class->system, ftrace_event_name(&tk->tp.call)); @@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tk->tp.nr_args; i++) seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; - - if (!trace_seq_puts(s, "\n")) - goto partial; + goto out; - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); } static enum print_line_t @@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, " <- ")) - goto partial; + trace_seq_puts(s, " <- "); if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; + goto out; - if (!trace_seq_puts(s, "\n")) - goto partial; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b863474..7a9ba62e9fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr) mmio_reset_data(tr); } -static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) { - int ret = 0; int i; resource_size_t start, end; const struct pci_driver *drv = pci_dev_driver(dev); - /* XXX: incomplete checks for trace_seq_printf() return value */ - ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", - dev->bus->number, dev->devfn, - dev->vendor, dev->device, dev->irq); + trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); /* * XXX: is pci_resource_to_user() appropriate, since we are * supposed to interpret the __ioremap() phys_addr argument based on @@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) */ for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", (unsigned 
long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); } if (drv) - ret += trace_seq_printf(s, " %s\n", drv->name); + trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_puts(s, " \n"); - return ret; + trace_seq_puts(s, " \n"); } static void destroy_header_iter(struct header_iter *hiter) @@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret = 1; trace_assign_type(field, entry); rw = &field->rw; switch (rw->opcode) { case MMIO_READ: - ret = trace_seq_printf(s, + trace_seq_printf(s, "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, @@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_puts(s, "rw what?\n"); + trace_seq_puts(s, "rw what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_map(struct trace_iterator *iter) @@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; trace_assign_type(field, entry); m = &field->map; switch (m->opcode) { case MMIO_PROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_puts(s, "map what?\n"); + trace_seq_puts(s, "map what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_mark(struct trace_iterator *iter) @@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; /* The trailing newline must be in the message. 
*/ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t mmio_print_line(struct trace_iterator *iter) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c6977d5a9b12..b77b9a697619 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bputs_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, field->str); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bprint_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) @@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct print_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, field->buf); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } const char * @@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%lx", val); - + trace_seq_putc(p, 0); return ret; @@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct trace_seq *p = &iter->tmp_seq; struct trace_entry *entry; - int ret; event = container_of(trace_event, struct ftrace_event_call, event); entry = iter->ent; @@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - return 0; + return trace_handle_return(s); } EXPORT_SYMBOL(ftrace_raw_output_prep); @@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; - int ret; - - ret = trace_seq_printf(s, "%s: ", name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_vprintf(s, fmt, ap); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", name); + trace_seq_vprintf(s, fmt, ap); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static int +static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS @@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } -static int +static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, sprint_symbol(str, address); name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } #ifndef CONFIG_64BIT @@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (file) { ret = trace_seq_path(s, &file->f_path); if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); + trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); } up_read(&mm->mmap_sem); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + return !trace_seq_has_overflowed(s); } int @@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags) { struct mm_struct *mm = NULL; - int ret = 1; unsigned int i; if (trace_flags & TRACE_ITER_SYM_USEROBJ) { @@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = entry->caller[i]; - if (ip == ULONG_MAX || !ret) + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) break; - if (ret) - ret = trace_seq_puts(s, " => "); + + trace_seq_puts(s, " => "); + if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - if (ret) - ret = trace_seq_putc(s, '\n'); + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); continue; } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_putc(s, '\n'); + + seq_print_user_ip(s, mm, ip, sym_flags); + trace_seq_putc(s, '\n'); } if (mm) mmput(mm); - return ret; + + return !trace_seq_has_overflowed(s); } int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { - int ret; - - if (!ip) - return trace_seq_putc(s, '0'); + if (!ip) { + trace_seq_putc(s, '0'); + goto out; + } if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); + seq_print_sym_offset(s, "%s", ip); else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; + seq_print_sym_short(s, "%s", ip); if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + + out: + return !trace_seq_has_overflowed(s); } /** @@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; - int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 
's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) - return 0; + trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq); if (entry->preempt_count) - ret = trace_seq_printf(s, "%x", entry->preempt_count); + trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_putc(s, '.'); + trace_seq_putc(s, '.'); - return ret; + return !trace_seq_has_overflowed(s); } static int @@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - if (!trace_seq_printf(s, "%8.8s-%-5d %3d", - comm, entry->pid, cpu)) - return 0; + trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh_us = 100; +#undef MARK +#define MARK(v, s) {.val = v, .sym = s} +/* trace overhead mark */ +static const struct trace_mark { + unsigned long long val; /* unit: nsec */ + char sym; +} mark[] = { + MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(1000000ULL , '#'), /* 1000 usecs */ + MARK(100000ULL , '!'), /* 100 usecs */ + MARK(10000ULL , '+'), /* 10 usecs */ +}; +#undef MARK + +char trace_find_mark(unsigned long long d) +{ + int i; + int size = ARRAY_SIZE(mark); + + for (i = 0; i < size; i++) { + if (d >= mark[i].val) + break; + } + + return (i == size) ? ' ' : mark[i].sym; +} static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) @@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); unsigned long rel_msec = (unsigned long)rel_ts; - return trace_seq_printf( - s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", - ns2usecs(iter->ts), - abs_msec, abs_usec, - rel_msec, rel_usec); + trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { - return trace_seq_printf( - s, "[%016llx] %lld (+%lld): ", - iter->ts, abs_ts, rel_ts); + trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { - return trace_seq_printf( - s, " %4lldus%c: ", - abs_ts, - rel_ts > preempt_mark_thresh_us ? '!' : - rel_ts > 1 ? 
'+' : ' '); + trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + trace_find_mark(rel_ts * NSEC_PER_USEC)); + } else { /* !verbose && !in_ns */ - return trace_seq_printf(s, " %4lld: ", abs_ts); + trace_seq_printf(s, " %4lld: ", abs_ts); } + + return !trace_seq_has_overflowed(s); } int trace_print_context(struct trace_iterator *iter) @@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter) unsigned long long t; unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; - int ret; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (!ret) - return 0; - if (trace_flags & TRACE_ITER_IRQ_INFO) { - ret = trace_print_lat_fmt(s, entry); - if (!ret) - return 0; - } + if (trace_flags & TRACE_ITER_IRQ_INFO) + trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { t = ns2usecs(iter->ts); usec_rem = do_div(t, USEC_PER_SEC); secs = (unsigned long)t; - return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); } else - return trace_seq_printf(s, " %12llu: ", iter->ts); + trace_seq_printf(s, " %12llu: ", iter->ts); + + return !trace_seq_has_overflowed(s); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; - int ret; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; @@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", - comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { - ret = lat_print_generic(s, entry, iter->cpu); + lat_print_generic(s, entry, iter->cpu); } - if (ret) - ret = lat_print_timestamp(iter, next_ts); + lat_print_timestamp(iter, next_ts); - return ret; + return !trace_seq_has_overflowed(s); } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; @@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event) goto out; } else { - + event->type = next_event_type++; list = &ftrace_event_list; } @@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { - if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } /* TRACE_FN */ @@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_puts(s, " <-")) - goto partial; - if (!seq_print_ip_sym(s, - field->parent_ip, - flags)) - goto partial; + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); } - if (!trace_seq_putc(s, '\n')) - goto partial; - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, @@ -806,12 +792,11 @@ static enum 
print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "%lx %lx\n", - field->ip, - field->parent_ip)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, @@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + SEQ_PUT_HEX_FIELD(s, field->ip); + SEQ_PUT_HEX_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, @@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); + SEQ_PUT_FIELD(s, field->ip); + SEQ_PUT_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_fn_funcs = { @@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, T = task_state_char(field->next_state); S = task_state_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); - if (!trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, @@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) if (!S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, @@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); + SEQ_PUT_HEX_FIELD(s, field->prev_pid); + SEQ_PUT_HEX_FIELD(s, field->prev_prio); + SEQ_PUT_HEX_FIELD(s, S); + SEQ_PUT_HEX_FIELD(s, field->next_cpu); + SEQ_PUT_HEX_FIELD(s, field->next_pid); + SEQ_PUT_HEX_FIELD(s, field->next_prio); + SEQ_PUT_HEX_FIELD(s, T); - return TRACE_TYPE_HANDLED; + return 
trace_handle_return(s); } static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, @@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); + SEQ_PUT_FIELD(s, field->prev_pid); + SEQ_PUT_FIELD(s, field->prev_prio); + SEQ_PUT_FIELD(s, field->prev_state); + SEQ_PUT_FIELD(s, field->next_cpu); + SEQ_PUT_FIELD(s, field->next_pid); + SEQ_PUT_FIELD(s, field->next_prio); + SEQ_PUT_FIELD(s, field->next_state); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_ctx_funcs = { @@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); - if (!trace_seq_puts(s, "<stack trace>\n")) - goto partial; + trace_seq_puts(s, "<stack trace>\n"); for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { - if (!trace_seq_puts(s, " => ")) - goto partial; - if (!seq_print_ip_sym(s, *p, flags)) - goto partial; - if (!trace_seq_putc(s, '\n')) - goto partial; - } + if (trace_seq_has_overflowed(s)) + break; - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " => "); + seq_print_ip_sym(s, *p, flags); + trace_seq_putc(s, '\n'); + } - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_stack_funcs = { @@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "<user stack trace>\n")) - goto partial; - - if (!seq_print_userip_objs(field, s, flags)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, "<user stack trace>\n"); + seq_print_userip_objs(field, s, flags); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_user_stack_funcs = { @@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_puts(s, field->str); - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_puts(s, field->str)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_puts(s, field->str)) - goto partial; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_puts(s, field->str); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bputs_funcs = { @@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + 
trace_seq_puts(s, ": "); + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bprint_funcs = { @@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_printf(s, ": %s", field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, @@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(&iter->seq); } static struct trace_event_functions trace_print_funcs = { diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 80b25b585a70..8ef2c40efb3c 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); extern int __unregister_ftrace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) +#define SEQ_PUT_FIELD(s, x) \ + trace_seq_putmem(s, &(x), sizeof(x)) + +#define SEQ_PUT_HEX_FIELD(s, x) \ + trace_seq_putmem_hex(s, &(x), sizeof(x)) #endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65c..c4e70b6bd7fa 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v) seq_puts(m, "\\t"); break; case '\\': - seq_puts(m, "\\"); + seq_putc(m, '\\'); break; case '"': seq_puts(m, "\\\""); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d4b9fc22cd27..b983b2fd2ca1 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -40,7 +40,8 @@ const char *reserved_field_names[] = { int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + return !trace_seq_has_overflowed(s); \ } \ const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); @@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct 
trace_seq *s, const char *name, int len = *(u32 *)data >> 16; if (!len) - return trace_seq_printf(s, " %s=(fault)", name); + trace_seq_printf(s, " %s=(fault)", name); else - return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); + trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); + return !trace_seq_has_overflowed(s); } NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9b40f3..2e293beb186e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -14,122 +14,26 @@ #include "trace.h" -static struct trace_array *ctx_trace; -static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); -static int sched_stopped; - - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); -} static void probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); - - local_irq_restore(flags); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - - event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu, pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(current); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, 
cpu); - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, wakee, current, - flags, pc); - - local_irq_restore(flags); } static int tracing_sched_register(void) @@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(); } - -/** - * tracing_start_sched_switch_record - start tracing context switches - * - * Turns on context switch tracing for a tracer. - */ -void tracing_start_sched_switch_record(void) -{ - if (unlikely(!ctx_trace)) { - WARN_ON(1); - return; - } - - tracing_start_sched_switch(); - - mutex_lock(&sched_register_mutex); - tracer_enabled++; - mutex_unlock(&sched_register_mutex); -} - -/** - * tracing_stop_sched_switch_record - start tracing context switches - * - * Turns off context switch tracing for a tracer. - */ -void tracing_stop_sched_switch_record(void) -{ - mutex_lock(&sched_register_mutex); - tracer_enabled--; - WARN_ON(tracer_enabled < 0); - mutex_unlock(&sched_register_mutex); - - tracing_stop_sched_switch(); -} - -/** - * tracing_sched_switch_assign_trace - assign a trace array for ctx switch - * @tr: trace array pointer to assign - * - * Some tracers might want to record the context switches in their - * trace. This function lets those tracers assign the trace array - * to use. - */ -void tracing_sched_switch_assign_trace(struct trace_array *tr) -{ - ctx_trace = tr; -} - diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 19bd8928ce94..8fb84b362816 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) wakeup_current_cpu = cpu; } +static void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + static void notrace probe_wakeup_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) diff --git 
a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..b0f86ea77881 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; tracing_start(); /* we should only have one item */ @@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; trace->reset(tr); tracing_start(); @@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #endif #ifdef CONFIG_SCHED_TRACER + +struct wakeup_test_data { + struct completion is_ready; + int go; +}; + static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ @@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data) .sched_deadline = 10000000ULL, .sched_period = 10000000ULL }; - struct completion *x = data; + struct wakeup_test_data *x = data; sched_setattr(current, &attr); /* Make it know we have a new prio */ - complete(x); + complete(&x->is_ready); /* now go to sleep and let the test wake us up */ set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!x->go) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } - complete(x); + complete(&x->is_ready); + + set_current_state(TASK_INTERRUPTIBLE); /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { - /* - * This will likely be the system top priority - * task, do short sleeps to let others run. - */ - msleep(100); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); + return 0; } - int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tr->max_latency; struct task_struct *p; - struct completion is_ready; + struct wakeup_test_data data; unsigned long count; int ret; - init_completion(&is_ready); + memset(&data, 0, sizeof(data)); + + init_completion(&data.is_ready); /* create a -deadline thread */ - p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); + p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } /* make sure the thread is running at -deadline policy */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); } - init_completion(&is_ready); + init_completion(&data.is_ready); + + data.go = 1; + /* memory barrier is in the wake_up_process() */ wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(&tr->trace_buffer, NULL); - printk("ret = %d\n", ret); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1f24ed99dca2..f8b45d8792f9 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -27,10 +27,19 @@ #include <linux/trace_seq.h> /* How much buffer is left on the trace_seq? 
*/ -#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) +#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) /* How much buffer is written? */ -#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) +#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) + +/* + * trace_seq should work with being initialized with 0s. + */ +static inline void __trace_seq_init(struct trace_seq *s) +{ + if (unlikely(!s->seq.size)) + trace_seq_init(s); +} /** * trace_print_seq - move the contents of trace_seq into a seq_file @@ -43,10 +52,11 @@ */ int trace_print_seq(struct seq_file *m, struct trace_seq *s) { - unsigned int len = TRACE_SEQ_BUF_USED(s); int ret; - ret = seq_write(m, s->buffer, len); + __trace_seq_init(s); + + ret = seq_buf_print_seq(m, &s->seq); /* * Only reset this buffer if we successfully wrote to the @@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. - * - * Returns 1 if we successfully written all the contents to - * the buffer. - * Returns 0 if we the length to write is bigger than the - * reserved buffer space. In this case, nothing gets written. */ -int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); + unsigned int save_len = s->seq.len; va_list ap; - int ret; - if (s->full || !len) - return 0; + if (s->full) + return; + + __trace_seq_init(s); va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + seq_buf_vprintf(&s->seq, fmt, ap); va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; } - - s->len += ret; - - return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); * @nmaskbits: The number of bits that are valid in @maskp * * Writes a ASCII representation of a bitmask string into @s. - * - * Returns 1 if we successfully written all the contents to - * the buffer. - * Returns 0 if we the length to write is bigger than the - * reserved buffer space. In this case, nothing gets written. */ -int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, int nmaskbits) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; - ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); - s->len += ret; + __trace_seq_init(s); - return 1; + seq_buf_bitmask(&s->seq, maskp, nmaskbits); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } } EXPORT_SYMBOL_GPL(trace_seq_bitmask); @@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. - * - * Returns how much it wrote to the buffer. 
*/ -int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; - ret = vsnprintf(s->buffer + s->len, len, fmt, args); + __trace_seq_init(s); + + seq_buf_vprintf(&s->seq, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; } - - s->len += ret; - - return len; } EXPORT_SYMBOL_GPL(trace_seq_vprintf); @@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); * * This function will take the format and the binary array and finish * the conversion into the ASCII string within the buffer. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; + + __trace_seq_init(s); - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + seq_buf_bprintf(&s->seq, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; + return; } - - s->len += ret; - - return len; } EXPORT_SYMBOL_GPL(trace_seq_bprintf); @@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf); * copy to user routines. This function records a simple string * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_puts(struct trace_seq *s, const char *str) +void trace_seq_puts(struct trace_seq *s, const char *str) { unsigned int len = strlen(str); if (s->full) - return 0; + return; + + __trace_seq_init(s); if (len > TRACE_SEQ_BUF_LEFT(s)) { s->full = 1; - return 0; + return; } - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; + seq_buf_putmem(&s->seq, str, len); } EXPORT_SYMBOL_GPL(trace_seq_puts); @@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * copy to user routines. This function records a simple charater * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_putc(struct trace_seq *s, unsigned char c) +void trace_seq_putc(struct trace_seq *s, unsigned char c) { if (s->full) - return 0; + return; + + __trace_seq_init(s); if (TRACE_SEQ_BUF_LEFT(s) < 1) { s->full = 1; - return 0; + return; } - s->buffer[s->len++] = c; - - return 1; + seq_buf_putc(&s->seq, c); } EXPORT_SYMBOL_GPL(trace_seq_putc); @@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc); * There may be cases where raw memory needs to be written into the * buffer and a strcpy() would not work. Using this function allows * for such cases. - * - * Returns how much it wrote to the buffer. 
*/ -int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) { if (s->full) - return 0; + return; + + __trace_seq_init(s); if (len > TRACE_SEQ_BUF_LEFT(s)) { s->full = 1; - return 0; + return; } - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; + seq_buf_putmem(&s->seq, mem, len); } EXPORT_SYMBOL_GPL(trace_seq_putmem); -#define MAX_MEMHEX_BYTES 8U -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - /** * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex * @s: trace sequence descriptor @@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); * This is similar to trace_seq_putmem() except instead of just copying the * raw memory into the buffer it writes its ASCII representation of it * in hex characters. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, +void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, unsigned int len) { - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - unsigned int start_len; - int i, j; - int cnt = 0; + unsigned int save_len = s->seq.len; if (s->full) - return 0; + return; - while (len) { - start_len = min(len, HEX_CHARS - 1); -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < start_len; i++) { -#else - for (i = start_len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - if (WARN_ON_ONCE(j == 0 || j/2 > len)) - break; - - /* j increments twice per loop */ - len -= j / 2; - hex[j++] = ' '; - - cnt += trace_seq_putmem(s, hex, j); + __trace_seq_init(s); + + /* Each byte is represented by two chars */ + if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + /* The added spaces can still cause an overflow */ + seq_buf_putmem_hex(&s->seq, mem, len); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; } - return cnt; } EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); @@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); */ int trace_seq_path(struct trace_seq *s, const struct path *path) { - unsigned char *p; + unsigned int save_len = s->seq.len; if (s->full) return 0; + __trace_seq_init(s); + if (TRACE_SEQ_BUF_LEFT(s) < 1) { s->full = 1; return 0; } - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; + seq_buf_path(&s->seq, path, "\n"); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; } - s->full = 1; - return 0; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_path); @@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; + __trace_seq_init(s); + return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,7 +13,6 @@ #include 
<linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> -#include <linux/magic.h> #include <asm/setup.h> @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 759d5e004517..c6ee36fcbf90 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, ret, syscall; + int i, syscall; trace = (typeof(trace))ent; syscall = trace->nr; @@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } - ret = trace_seq_printf(s, "%s(", entry->name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) { - ret = trace_seq_printf(s, "%s ", entry->types[i]); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", entry->types[i]); + /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: %lx%s", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? "" : ", "); } - ret = trace_seq_putc(s, ')'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - + trace_seq_putc(s, ')'); end: - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, struct syscall_trace_exit *trace; int syscall; struct syscall_metadata *entry; - int ret; trace = (typeof(trace))ent; syscall = trace->nr; @@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, if (!entry) { trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + goto out; } if (entry->exit_event->event.type != ent->type) { @@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return TRACE_TYPE_UNHANDLED; } - ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + out: + return trace_handle_return(s); } extern char *__bad_type_size(void); @@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); 
tr->sys_refcount_enter--; - rcu_assign_pointer(tr->enter_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - rcu_assign_pointer(tr->exit_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); @@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -static int __init init_ftrace_syscalls(void) +void __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; @@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void) GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return -ENOMEM; + return; } for (i = 0; i < NR_syscalls; i++) { @@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void) meta->syscall_nr = i; syscalls_metadata[i] = meta; } - - return 0; } -early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS @@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 33ff6a24b802..8520acc34b18 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,8 +552,7 @@ error: return ret; fail_address_parse: - if (inode) - iput(inode); + iput(inode); pr_info("Failed to parse address or file.\n"); @@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[1], entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct probe_arg *parg = &tu->tp.args[i]; if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto partial; + goto out; } - if (trace_seq_puts(s, "\n")) - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); -partial: - return 
TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } typedef bool (*filter_func_t)(struct uprobe_consumer *self, diff --git a/kernel/uid16.c b/kernel/uid16.c index 602e5bbbceff..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); void user_return_notifier_register(struct user_return_notifier *urn) { set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); - hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); + hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); } EXPORT_SYMBOL_GPL(user_return_notifier_register); @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); void user_return_notifier_unregister(struct user_return_notifier *urn) { hlist_del(&urn->link); - if (hlist_empty(&__get_cpu_var(return_notifier_list))) + if (hlist_empty(this_cpu_ptr(&return_notifier_list))) clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); } EXPORT_SYMBOL_GPL(user_return_notifier_unregister); diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e44..b069ccbfb0b0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .proc_inum = PROC_USER_INIT_INO, + .ns.inum = PROC_USER_INIT_INO, +#ifdef CONFIG_USER_NS + .ns.ops = &userns_operations, +#endif + .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3ec..4109f8320684 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -24,6 +24,7 @@ #include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - ret = proc_alloc_inum(&ns->proc_inum); + ret = ns_alloc_inum(&ns->ns); if (ret) { kmem_cache_free(user_ns_cachep, ns); return ret; } + ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ + mutex_lock(&userns_state_mutex); + ns->flags = parent_ns->flags; + mutex_unlock(&userns_state_mutex); + set_cred_user_ns(new, ns); #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); ns = parent; } while (atomic_dec_and_test(&parent->count)); @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } - -static DEFINE_MUTEX(id_map_mutex); - static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; /* - * The id_map_mutex serializes all writes to any given map. + * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); ret = -EPERM; /* Only allow one successful write to the map */ @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!page) goto out; - /* Only allow <= page size writes at the beginning of the file */ + /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, *ppos = count; ret = count; out: - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); if (page) free_page(page); return ret; @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + const struct cred *cred = file->f_cred; + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. + */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && + uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->fsuid)) + if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, file->f_cred->fsgid)) + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && + gid_eq(gid, cred->egid)) return true; } } @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, return false; } -static void *userns_get(struct task_struct *task) +int proc_setgroups_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + unsigned long userns_flags = ACCESS_ONCE(ns->flags); + + seq_printf(seq, "%s\n", + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? 
+ "allow" : "deny"); + return 0; +} + +ssize_t proc_setgroups_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + char kbuf[8], *pos; + bool setgroups_allowed; + ssize_t ret; + + /* Only allow a very narrow range of strings to be written */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= sizeof(kbuf))) + goto out; + + /* What was written? */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + ret = -EINVAL; + if (strncmp(pos, "allow", 5) == 0) { + pos += 5; + setgroups_allowed = true; + } + else if (strncmp(pos, "deny", 4) == 0) { + pos += 4; + setgroups_allowed = false; + } + else + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + ret = -EPERM; + mutex_lock(&userns_state_mutex); + if (setgroups_allowed) { + /* Enabling setgroups after setgroups has been disabled + * is not allowed. + */ + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) + goto out_unlock; + } else { + /* Permanently disabling setgroups after setgroups has + * been enabled by writing the gid_map is not allowed. + */ + if (ns->gid_map.nr_extents != 0) + goto out_unlock; + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; + } + mutex_unlock(&userns_state_mutex); + + /* Report a successful write */ + *ppos = count; + ret = count; +out: + return ret; +out_unlock: + mutex_unlock(&userns_state_mutex); + goto out; +} + +bool userns_may_setgroups(const struct user_namespace *ns) +{ + bool allowed; + + mutex_lock(&userns_state_mutex); + /* It is not safe to use setgroups until a gid mapping in + * the user namespace has been established. + */ + allowed = ns->gid_map.nr_extents != 0; + /* Is setgroups allowed? */ + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); + mutex_unlock(&userns_state_mutex); + + return allowed; +} + +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + +static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); - return user_ns; + return user_ns ? 
&user_ns->ns : NULL; } -static void userns_put(void *ns) +static void userns_put(struct ns_common *ns) { - put_user_ns(ns); + put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, void *ns) +static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - struct user_namespace *user_ns = ns; + struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } -static unsigned int userns_inum(void *ns) -{ - struct user_namespace *user_ns = ns; - return user_ns->proc_inum; -} - const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, - .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 883aaaa7de8a..831ea7108232 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) { kfree(ns); return ERR_PTR(err); } + ns->ns.ops = &utsns_operations; + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kfree(ns); } -static void *utsns_get(struct task_struct *task) +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; @@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task) } task_unlock(task); - return ns; + return ns ? 
&ns->ns : NULL; } -static void utsns_put(void *ns) +static void utsns_put(struct ns_common *ns) { - put_uts_ns(ns); + put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, void *new) +static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) { - struct uts_namespace *ns = new; + struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } -static unsigned int utsns_inum(void *vp) -{ - struct uts_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, - .inum = utsns_inum, }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..70bf11815f84 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -15,11 +15,6 @@ #include <linux/cpu.h> #include <linux/nmi.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/smpboot.h> @@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); +static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); @@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn; static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static bool hardlockup_detector_enabled = true; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void watchdog_enable_hardlockup_detector(bool val) +{ + hardlockup_detector_enabled = val; +} + +bool watchdog_hardlockup_detector_is_enabled(void) +{ + return hardlockup_detector_enabled; +} + static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) @@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_user_enabled = 0; + else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { + /* + * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) + * has the same effect. + */ + watchdog_user_enabled = 1; + watchdog_enable_hardlockup_detector(true); + } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -185,7 +208,7 @@ void touch_nmi_watchdog(void) * case we shouldn't have to worry about the watchdog * going off. 
*/ - __raw_get_cpu_var(watchdog_nmi_touch) = true; + raw_cpu_write(watchdog_nmi_touch, true); touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); void touch_softlockup_watchdog_sync(void) { - __raw_get_cpu_var(softlockup_touch_sync) = true; - __raw_get_cpu_var(watchdog_touch_ts) = 0; + __this_cpu_write(softlockup_touch_sync, true); + __this_cpu_write(watchdog_touch_ts, 0); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -333,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) + if (__this_cpu_read(soft_watchdog_warn) == true) { + /* + * When multiple processes are causing softlockups the + * softlockup detector only warns on the first one + * because the code relies on a full quiet cycle to + * re-arm. The second process prevents the quiet cycle + * and never gets reported. Use task pointers to detect + * this. + */ + if (__this_cpu_read(softlockup_task_ptr_saved) != + current) { + __this_cpu_write(soft_watchdog_warn, false); + __touch_watchdog(); + } return HRTIMER_RESTART; + } if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already @@ -350,6 +387,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); if (regs) @@ -387,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -407,7 +445,7 @@ static void watchdog_enable(unsigned int cpu) static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); @@ -454,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* + * Some kernels need to default hard lockup detection to + * 'disabled', for example a guest on a hypervisor. + */ + if (!watchdog_hardlockup_detector_is_enabled()) { + event = ERR_PTR(-ENOENT); + goto handle_err; + } + /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -468,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); +handle_err: /* save cpu0 error for future comparision */ if (cpu == 0 && IS_ERR(event)) cpu0_err = PTR_ERR(event); @@ -514,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); } - return; + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. 
*/ + cpu0_err = 0; + } } #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } @@ -534,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = { static void restart_watchdog_hrtimer(void *info) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); int ret; /* @@ -610,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int err, old_thresh, old_enabled; + bool old_hardlockup; static DEFINE_MUTEX(watchdog_proc_mutex); mutex_lock(&watchdog_proc_mutex); old_thresh = ACCESS_ONCE(watchdog_thresh); old_enabled = ACCESS_ONCE(watchdog_user_enabled); + old_hardlockup = watchdog_hardlockup_detector_is_enabled(); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) @@ -626,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write, * disabled. The 'watchdog_running' variable check in * watchdog_*_all_cpus() function takes care of this. */ - if (watchdog_user_enabled && watchdog_thresh) + if (watchdog_user_enabled && watchdog_thresh) { + /* + * Prevent a change in watchdog_thresh accidentally overriding + * the enablement of the hardlockup detector. + */ + if (watchdog_user_enabled != old_enabled) + watchdog_enable_hardlockup_detector(true); err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - else + } else watchdog_disable_all_cpus(); /* Restore old values on failure */ if (err) { watchdog_thresh = old_thresh; watchdog_user_enabled = old_enabled; + watchdog_enable_hardlockup_detector(old_hardlockup); } out: mutex_unlock(&watchdog_proc_mutex); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5dbe22aa3efd..6202b08f1933 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); + spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock(&pool->lock); - spin_unlock_irq(&wq_mayday_lock); + spin_unlock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2043,9 +2043,10 @@ __acquires(&pool->lock) * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in - * stop_machine. + * stop_machine. At the same time, report a quiescent RCU state so + * the same condition doesn't freeze RCU. */ - cond_resched(); + cond_resched_rcu_qs(); spin_lock_irq(&pool->lock); @@ -2247,12 +2248,30 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); - process_scheduled_works(rescuer); + if (!list_empty(scheduled)) { + process_scheduled_works(rescuer); + + /* + * The above execution of rescued work items could + * have created more to rescue through + * pwq_activate_first_delayed() or chained + * queueing. 
Let's put @pwq back on mayday list so + * that such back-to-back work items, which may be + * being used to relieve memory pressure, don't + * incur MAYDAY_INTERVAL delay inbetween. + */ + if (need_to_create_worker(pool)) { + spin_lock(&wq_mayday_lock); + get_pwq(pwq); + list_move_tail(&pwq->mayday_node, &wq->maydays); + spin_unlock(&wq_mayday_lock); + } + } /* * Put the reference grabbed by send_mayday(). @pool won't |
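The /proc/<pid>/setgroups file and userns_may_setgroups() added in the user_namespace.c hunks above change how an unprivileged process maps group IDs in a new user namespace: its own egid may only be written to gid_map once setgroups has been set to "deny", and setgroups can no longer be re-enabled (or disabled after gid_map is populated). A minimal userspace sketch of the resulting sequence; the helper name write_file() is illustrative and error handling is abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];
	/* Capture the outer ids before unshare(); afterwards getuid()
	 * reports the overflow id until a mapping is written. */
	unsigned uid = getuid();
	unsigned gid = getgid();

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}

	/* Once "deny" is written, setgroups(2) fails with EPERM in this
	 * namespace, and gid_map may map the caller's own gid. */
	write_file("/proc/self/setgroups", "deny");

	snprintf(map, sizeof(map), "0 %u 1", uid);
	write_file("/proc/self/uid_map", map);

	snprintf(map, sizeof(map), "0 %u 1", gid);
	write_file("/proc/self/gid_map", map);

	return 0;
}

The ordering matters: writing gid_map first would leave setgroups permanently enabled, and the unprivileged single-extent gid mapping above would then be refused by new_idmap_permitted().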
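Separately, the __get_cpu_var()/__raw_get_cpu_var() replacements in user-return-notifier.c and watchdog.c above are part of retiring the old lvalue-style per-cpu accessors in favour of the this_cpu_*()/raw_cpu_*() API. A minimal sketch of the equivalences, using a made-up per-cpu variable (illustrative only):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_stamp);

static void example_touch(void)
{
	unsigned long *stamp;

	/* old: __raw_get_cpu_var(example_stamp) = 1;
	 * raw_cpu_write() is the variant without debug preemption checks. */
	raw_cpu_write(example_stamp, 1);

	/* old: &__get_cpu_var(example_stamp)
	 * this_cpu_ptr() yields the address of this CPU's instance. */
	stamp = this_cpu_ptr(&example_stamp);
	(*stamp)++;
}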
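Finally, the watchdog_enable_hardlockup_detector() hook introduced in watchdog.c is intended to be called once, on the boot CPU, before command-line parsing, so that nmi_watchdog= can still override it. A hypothetical early-boot caller (the hunks above only add the hook itself; the function and comment names here are assumptions for illustration):

void __init example_guest_early_init(void)
{
	/* Hard lockup detection via perf NMIs is assumed unreliable on
	 * this (hypothetical) hypervisor, so default it to off; the
	 * admin can still force it back on with nmi_watchdog=1. */
	watchdog_enable_hardlockup_detector(false);
}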