author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 11:35:36 -0800
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 11:35:36 -0800
commit    4ba24fef3eb3b142197135223b90ced2f319cd53 (patch)
tree      a20c125b27740ec7b4c761b11d801108e1b316b2 /kernel
parent    47c1ffb2b6b630894e9a16442611c056ab21c057 (diff)
parent    98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/acct.c | 14
-rw-r--r--  kernel/async.c | 8
-rw-r--r--  kernel/audit.c | 55
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/audit_tree.c | 23
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 79
-rw-r--r--  kernel/auditsc.c | 86
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/arraymap.c | 156
-rw-r--r--  kernel/bpf/core.c | 136
-rw-r--r--  kernel/bpf/hashtab.c | 367
-rw-r--r--  kernel/bpf/helpers.c | 89
-rw-r--r--  kernel/bpf/syscall.c | 606
-rw-r--r--  kernel/bpf/test_stub.c | 78
-rw-r--r--  kernel/bpf/verifier.c | 2003
-rw-r--r--  kernel/cgroup.c | 366
-rw-r--r--  kernel/configs/tiny.config | 4
-rw-r--r--  kernel/context_tracking.c | 40
-rw-r--r--  kernel/cpu.c | 37
-rw-r--r--  kernel/cpuset.c | 200
-rw-r--r--  kernel/crash_dump.c | 1
-rw-r--r--  kernel/debug/debug_core.c | 52
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 43
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 267
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/events/callchain.c | 6
-rw-r--r--  kernel/events/core.c | 347
-rw-r--r--  kernel/events/hw_breakpoint.c | 7
-rw-r--r--  kernel/events/uprobes.c | 9
-rw-r--r--  kernel/exit.c | 317
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 25
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 38
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/groups.c | 11
-rw-r--r--  kernel/irq/Kconfig | 18
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 217
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 36
-rw-r--r--  kernel/irq/internals.h | 20
-rw-r--r--  kernel/irq/irqdesc.c | 94
-rw-r--r--  kernel/irq/irqdomain.c | 567
-rw-r--r--  kernel/irq/manage.c | 34
-rw-r--r--  kernel/irq/msi.c | 330
-rw-r--r--  kernel/irq/pm.c | 159
-rw-r--r--  kernel/irq/proc.c | 22
-rw-r--r--  kernel/irq_work.c | 27
-rw-r--r--  kernel/kallsyms.c | 11
-rw-r--r--  kernel/kexec.c | 34
-rw-r--r--  kernel/kmod.c | 111
-rw-r--r--  kernel/kprobes.c | 20
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/locktorture.c | 529
-rw-r--r--  kernel/locking/mcs_spinlock.h | 3
-rw-r--r--  kernel/locking/mutex-debug.c | 2
-rw-r--r--  kernel/locking/mutex.c | 422
-rw-r--r--  kernel/locking/mutex.h | 2
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 27
-rw-r--r--  kernel/locking/semaphore.c | 12
-rw-r--r--  kernel/module.c | 206
-rw-r--r--  kernel/nsproxy.c | 10
-rw-r--r--  kernel/panic.c | 14
-rw-r--r--  kernel/params.c | 121
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 57
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/hibernate.c | 22
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/process.c | 58
-rw-r--r--  kernel/power/qos.c | 27
-rw-r--r--  kernel/power/snapshot.c | 63
-rw-r--r--  kernel/power/suspend.c | 51
-rw-r--r--  kernel/power/suspend_test.c | 32
-rw-r--r--  kernel/power/swap.c | 43
-rw-r--r--  kernel/printk/printk.c | 134
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcu.h | 2
-rw-r--r--  kernel/rcu/rcutorture.c | 279
-rw-r--r--  kernel/rcu/tiny.c | 26
-rw-r--r--  kernel/rcu/tree.c | 227
-rw-r--r--  kernel/rcu/tree.h | 41
-rw-r--r--  kernel/rcu/tree_plugin.h | 538
-rw-r--r--  kernel/rcu/update.c | 432
-rw-r--r--  kernel/reboot.c | 81
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/resource.c | 106
-rw-r--r--  kernel/sched/auto_group.c | 5
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 695
-rw-r--r--  kernel/sched/cpudeadline.c | 4
-rw-r--r--  kernel/sched/cpudeadline.h | 3
-rw-r--r--  kernel/sched/cpupri.h | 3
-rw-r--r--  kernel/sched/cputime.c | 64
-rw-r--r--  kernel/sched/deadline.c | 202
-rw-r--r--  kernel/sched/debug.c | 24
-rw-r--r--  kernel/sched/fair.c | 876
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/idle_task.c | 5
-rw-r--r--  kernel/sched/rt.c | 42
-rw-r--r--  kernel/sched/sched.h | 129
-rw-r--r--  kernel/sched/stop_task.c | 7
-rw-r--r--  kernel/sched/wait.c | 102
-rw-r--r--  kernel/seccomp.c | 259
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 28
-rw-r--r--  kernel/smpboot.c | 15
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/stacktrace.c | 32
-rw-r--r--  kernel/sys.c | 503
-rw-r--r--  kernel/sys_ni.c | 11
-rw-r--r--  kernel/sysctl.c | 35
-rw-r--r--  kernel/sysctl_binary.c | 2
-rw-r--r--  kernel/taskstats.c | 4
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 23
-rw-r--r--  kernel/time/posix-cpu-timers.c | 16
-rw-r--r--  kernel/time/posix-timers.c | 1
-rw-r--r--  kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c) | 0
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 7
-rw-r--r--  kernel/time/tick-oneshot.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 92
-rw-r--r--  kernel/time/time.c | 21
-rw-r--r--  kernel/time/timekeeping.c | 127
-rw-r--r--  kernel/time/timer.c | 7
-rw-r--r--  kernel/torture.c | 32
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 151
-rw-r--r--  kernel/trace/ftrace.c | 778
-rw-r--r--  kernel/trace/ring_buffer.c | 154
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 3
-rw-r--r--  kernel/trace/trace.c | 288
-rw-r--r--  kernel/trace/trace.h | 31
-rw-r--r--  kernel/trace/trace_branch.c | 47
-rw-r--r--  kernel/trace/trace_events.c | 62
-rw-r--r--  kernel/trace/trace_events_filter.c | 29
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 119
-rw-r--r--  kernel/trace/trace_functions_graph.c | 423
-rw-r--r--  kernel/trace/trace_kdb.c | 25
-rw-r--r--  kernel/trace/trace_kprobe.c | 46
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 52
-rw-r--r--  kernel/trace/trace_output.c | 446
-rw-r--r--  kernel/trace/trace_output.h | 16
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_probe.c | 10
-rw-r--r--  kernel/trace/trace_sched_switch.c | 144
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_seq.c | 253
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 66
-rw-r--r--  kernel/trace/trace_uprobe.c | 28
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/user_namespace.c | 153
-rw-r--r--  kernel/utsname.c | 31
-rw-r--r--  kernel/watchdog.c | 90
-rw-r--r--  kernel/workqueue.c | 35
171 files changed, 12976 insertions, 5148 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -86,7 +85,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
-obj-$(CONFIG_NET) += bpf/
+obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
- struct pid_namespace *ns = acct->ns;
struct file *file = acct->file;
/*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION == 3
- ac.ac_pid = task_tgid_nr_ns(current, ns);
- rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
- rcu_read_unlock();
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
+ pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index ba2ff5a5c600..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-int audit_net_id;
+static int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
* This function doesn't consume an skb as might be expected since it has to
* copy it anyways.
*/
-static void kauditd_send_multicast_skb(struct sk_buff *skb)
+static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *copy;
struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
* no reason for new multicast clients to continue with this
* non-compliance.
*/
- copy = skb_copy(skb, GFP_KERNEL);
+ copy = skb_copy(skb, gfp_mask);
if (!copy)
return;
- nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
}
/*
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
set_freezable();
while (!kthread_should_stop()) {
struct sk_buff *skb;
- DECLARE_WAITQUEUE(wait, current);
flush_hold_queue();
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
audit_printk_skb(skb);
continue;
}
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
}
return 0;
}
@@ -724,7 +715,7 @@ static int audit_get_feature(struct sk_buff *skb)
seq = nlmsg_hdr(skb)->nlmsg_seq;
- audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
+ audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
return 0;
}
@@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
audit_log_task_info(ab, current);
- audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
audit_log_end(ab);
@@ -750,7 +741,7 @@ static int audit_set_feature(struct sk_buff *skb)
struct audit_features *uaf;
int i;
- BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_skb_queue);
- s.version = AUDIT_VERSION_LATEST;
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
s.backlog_wait_time = audit_backlog_wait_time;
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
@@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
}
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(int group)
+static int audit_bind(struct net *net, int group)
{
if (!capable(CAP_AUDIT_READ))
return -EPERM;
@@ -1301,19 +1292,9 @@ err:
*/
unsigned int audit_serial(void)
{
- static DEFINE_SPINLOCK(serial_lock);
- static unsigned int serial = 0;
-
- unsigned long flags;
- unsigned int ret;
-
- spin_lock_irqsave(&serial_lock, flags);
- do {
- ret = ++serial;
- } while (unlikely(!ret));
- spin_unlock_irqrestore(&serial_lock, flags);
+ static atomic_t serial = ATOMIC_INIT(0);
- return ret;
+ return atomic_add_return(1, &serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1681,7 +1662,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
}
}
-void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
kernel_cap_t *perm = &name->fcap.permitted;
kernel_cap_t *inh = &name->fcap.inheritable;
@@ -1860,7 +1841,7 @@ EXPORT_SYMBOL(audit_log_task_context);
void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
{
const struct cred *cred;
- char name[sizeof(tsk->comm)];
+ char comm[sizeof(tsk->comm)];
struct mm_struct *mm = tsk->mm;
char *tty;
@@ -1894,9 +1875,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
from_kgid(&init_user_ns, cred->fsgid),
tty, audit_get_sessionid(tsk));
- get_task_comm(name, tsk);
audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
+ audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
if (mm) {
down_read(&mm->mmap_sem);
@@ -1959,7 +1939,8 @@ void audit_log_end(struct audit_buffer *ab)
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- kauditd_send_multicast_skb(ab->skb);
+ nlh->nlmsg_len = ab->skb->len;
+ kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
/*
* The original kaudit unicast socket sends up messages with
@@ -1970,7 +1951,7 @@ void audit_log_end(struct audit_buffer *ab)
* protocol between the kaudit kernel subsystem and the auditd
* userspace code.
*/
- nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
+ nlh->nlmsg_len -= NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 7bb65730c890..3cdffad5a1d9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name,
const struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
-extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
extern void audit_log_name(struct audit_context *context,
struct audit_names *n, struct path *path,
int record_num, int *call_panic);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 135944a7b28a..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
chunk->owners[i].index = i;
}
fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
@@ -173,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
- if (!entry->i.inode)
+ if (!entry->inode)
return;
- list = chunk_hash(entry->i.inode);
+ list = chunk_hash(entry->inode);
list_add_rcu(&chunk->hash, list);
}
@@ -187,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
/* mark.inode may have gone NULL, but who cares? */
- if (p->mark.i.inode == inode) {
+ if (p->mark.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -230,7 +231,7 @@ static void untag_chunk(struct node *p)
new = alloc_chunk(size);
spin_lock(&entry->lock);
- if (chunk->dead || !entry->i.inode) {
+ if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
if (new)
free_chunk(new);
@@ -257,7 +258,7 @@ static void untag_chunk(struct node *p)
goto Fallback;
fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -385,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
spin_lock(&old_entry->lock);
- if (!old_entry->i.inode) {
+ if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
fsnotify_put_mark(old_entry);
@@ -394,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
}
fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return 0;
}
-static void audit_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_krule *rule)
{
struct audit_buffer *ab;
@@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule)
if (unlikely(!ab))
return;
audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
+ audit_log_string(ab, "remove_rule");
audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
@@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- audit_log_remove_rule(rule);
+ audit_tree_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -610,7 +611,7 @@ void audit_trim_trees(void)
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.i.inode;
+ struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 70b4554d2fbe..ad9c1682f616 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent,
&nentry->rule.list);
}
- audit_watch_log_rule_change(r, owatch, "updated rules");
+ audit_watch_log_rule_change(r, owatch, "updated_rules");
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
@@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
- audit_watch_log_rule_change(r, w, "remove rule");
+ audit_watch_log_rule_change(r, w, "remove_rule");
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index c447cd9848d1..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
DEFINE_MUTEX(audit_filter_mutex);
+static void audit_free_lsm_field(struct audit_field *f)
+{
+ switch (f->type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ kfree(f->lsm_str);
+ security_audit_rule_free(f->lsm_rule);
+ }
+}
+
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
@@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e)
if (erule->watch)
audit_put_watch(erule->watch);
if (erule->fields)
- for (i = 0; i < erule->field_count; i++) {
- struct audit_field *f = &erule->fields[i];
- kfree(f->lsm_str);
- security_audit_rule_free(f->lsm_rule);
- }
+ for (i = 0; i < erule->field_count; i++)
+ audit_free_lsm_field(&erule->fields[i]);
kfree(erule->fields);
kfree(erule->filterkey);
kfree(e);
@@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
if (krule->listnr != AUDIT_FILTER_EXIT ||
- krule->watch || krule->inode_f || krule->tree ||
+ krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -422,28 +437,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
- f->uid = INVALID_UID;
- f->gid = INVALID_GID;
- f->lsm_str = NULL;
- f->lsm_rule = NULL;
/* Support legacy tests for a valid loginuid */
if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
f->val = 0;
- }
-
- if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
- struct pid *pid;
- rcu_read_lock();
- pid = find_vpid(f->val);
- if (!pid) {
- rcu_read_unlock();
- err = -ESRCH;
- goto exit_free;
- }
- f->val = pid_nr(pid);
- rcu_read_unlock();
+ entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
err = audit_field_valid(entry, f);
@@ -619,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_LOGINUID_SET:
+ if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
+ data->fields[i] = AUDIT_LOGINUID;
+ data->values[i] = AUDIT_UID_UNSET;
+ break;
+ }
+ /* fallthrough if set */
default:
data->values[i] = f->val;
}
@@ -635,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
int i;
if (a->flags != b->flags ||
+ a->pflags != b->pflags ||
a->listnr != b->listnr ||
a->action != b->action ||
a->field_count != b->field_count)
@@ -753,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
new = &entry->rule;
new->vers_ops = old->vers_ops;
new->flags = old->flags;
+ new->pflags = old->pflags;
new->listnr = old->listnr;
new->action = old->action;
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
@@ -1053,30 +1061,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
int err = 0;
struct audit_entry *entry;
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
switch (type) {
case AUDIT_ADD_RULE:
- entry = audit_data_to_entry(data, datasz);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
err = audit_add_rule(entry);
- audit_log_rule_change("add rule", &entry->rule, !err);
- if (err)
- audit_free_rule(entry);
+ audit_log_rule_change("add_rule", &entry->rule, !err);
break;
case AUDIT_DEL_RULE:
- entry = audit_data_to_entry(data, datasz);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
err = audit_del_rule(entry);
- audit_log_rule_change("remove rule", &entry->rule, !err);
- audit_free_rule(entry);
+ audit_log_rule_change("remove_rule", &entry->rule, !err);
break;
default:
- return -EINVAL;
+ err = -EINVAL;
+ WARN_ON(1);
}
+ if (err || type == AUDIT_DEL_RULE)
+ audit_free_rule(entry);
+
return err;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21eae3c05ec0..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,10 +67,13 @@
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
+#include <asm/syscall.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/ctype.h>
+#include <linux/string.h>
+#include <uapi/linux/limits.h>
#include "audit.h"
@@ -125,14 +128,6 @@ struct audit_tree_refs {
struct audit_chunk *c[31];
};
-static inline int open_arg(int flags, int mask)
-{
- int n = ACC_MODE(flags);
- if (flags & (O_TRUNC | O_CREAT))
- n |= AUDIT_PERM_WRITE;
- return n & mask;
-}
-
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
@@ -1505,7 +1500,6 @@ void __audit_free(struct task_struct *tsk)
/**
* audit_syscall_entry - fill in an audit record at syscall entry
- * @arch: architecture type
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
@@ -1520,9 +1514,8 @@ void __audit_free(struct task_struct *tsk)
* will only be written if another part of the kernel requests that it
* be written).
*/
-void __audit_syscall_entry(int arch, int major,
- unsigned long a1, unsigned long a2,
- unsigned long a3, unsigned long a4)
+void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4)
{
struct task_struct *tsk = current;
struct audit_context *context = tsk->audit_context;
@@ -1536,7 +1529,7 @@ void __audit_syscall_entry(int arch, int major,
if (!audit_enabled)
return;
- context->arch = arch;
+ context->arch = syscall_get_arch();
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
@@ -1870,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- /* does the name pointer match? */
- if (!n->name || n->name->name != name->name)
+ if (!n->name || strcmp(n->name->name, name->name))
continue;
/* match the correct record type */
@@ -1886,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
out_alloc:
- /* unable to find the name from a previous getname(). Allocate a new
- * anonymous entry.
- */
- n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
+ /* unable to find an entry with both a matching name and type */
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
+ /* unfortunately, while we may have a path name to record with the
+ * inode, we can't always rely on the string lasting until the end of
+ * the syscall so we need to create our own copy, it may fail due to
+ * memory allocation issues, but we do our best */
+ if (name) {
+ /* we can't use getname_kernel() due to size limits */
+ size_t len = strlen(name->name) + 1;
+ struct filename *new = __getname();
+
+ if (unlikely(!new))
+ goto out;
+
+ if (len <= (PATH_MAX - sizeof(*new))) {
+ new->name = (char *)(new) + sizeof(*new);
+ new->separate = false;
+ } else if (len <= PATH_MAX) {
+ /* this looks odd, but is due to final_putname() */
+ struct filename *new2;
+
+ new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
+ if (unlikely(!new2)) {
+ __putname(new);
+ goto out;
+ }
+ new2->name = (char *)new;
+ new2->separate = true;
+ new = new2;
+ } else {
+ /* we should never get here, but let's be safe */
+ __putname(new);
+ goto out;
+ }
+ strlcpy((char *)new->name, name->name, len);
+ new->uptr = NULL;
+ new->aname = n;
+ n->name = new;
+ n->name_put = true;
+ }
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1906,6 +1934,11 @@ out:
audit_copy_inode(n, dentry, inode);
}
+void __audit_file(const struct file *file)
+{
+ __audit_inode(NULL, file->f_path.dentry, 0);
+}
+
/**
* __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
@@ -2382,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_dentry);
+ dentry = dget(bprm->file->f_path.dentry);
get_vfs_caps_from_disk(dentry, &vcaps);
dput(dentry);
@@ -2406,7 +2439,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
* @new: the new credentials
* @old: the old (current) credentials
*
- * Record the aguments userspace sent to sys_capset for later printing by the
+ * Record the arguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
@@ -2433,6 +2466,7 @@ static void audit_log_task(struct audit_buffer *ab)
kgid_t gid;
unsigned int sessionid;
struct mm_struct *mm = current->mm;
+ char comm[sizeof(current->comm)];
auid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
@@ -2445,7 +2479,7 @@ static void audit_log_task(struct audit_buffer *ab)
sessionid);
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
- audit_log_untrustedstring(ab, current->comm);
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
if (mm) {
down_read(&mm->mmap_sem);
if (mm->exe_file)
@@ -2488,11 +2522,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
if (unlikely(!ab))
return;
audit_log_task(ab);
- audit_log_format(ab, " sig=%ld", signr);
- audit_log_format(ab, " syscall=%ld", syscall);
- audit_log_format(ab, " compat=%d", is_compat_task());
- audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
- audit_log_format(ab, " code=0x%x", code);
+ audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
+ signr, syscall_get_arch(), syscall, is_compat_task(),
+ KSTK_EIP(current), code);
audit_log_end(ab);
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
+ifdef CONFIG_TEST_BPF
+obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size, array_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_array_map);
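
A note on the size arithmetic in array_map_alloc() above: the element size is the value size rounded up to 8 bytes, and the max_entries bound exists so the 32-bit multiplication that follows cannot overflow. A minimal userspace sketch of the same checks follows (plain C, independent of the kernel headers; struct array_hdr and round_up8 are illustrative stand-ins, not kernel names):

#include <stdint.h>
#include <stdio.h>

#define U32_MAX 0xffffffffU

/* stand-in for struct bpf_array's fixed header, only its size matters here */
struct array_hdr { uint64_t placeholder[4]; };

static uint32_t round_up8(uint32_t x)
{
	return (x + 7) & ~7U;   /* round_up(x, 8) */
}

int main(void)
{
	uint32_t value_size = 6, max_entries = 100;
	uint32_t elem_size = round_up8(value_size);   /* 6 -> 8 */

	/* same guard as the diff: reject round_up wrap and u32 overflow */
	if (elem_size == 0 ||
	    max_entries > (U32_MAX - sizeof(struct array_hdr)) / elem_size) {
		fprintf(stderr, "would overflow\n");
		return 1;
	}

	/* element i lives at value + elem_size * i, so the total size is */
	uint32_t array_size = sizeof(struct array_hdr) + max_entries * elem_size;

	printf("elem_size=%u array_size=%u\n", elem_size, array_size);
	return 0;
}
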
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,14 @@
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+
#include <linux/filter.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
#include <asm/unaligned.h>
+#include <linux/bpf.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *fp;
+
+ size = round_up(size, PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp == NULL)
+ return NULL;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
+ vfree(fp);
+ return NULL;
+ }
+
+ fp->pages = size / PAGE_SIZE;
+ fp->aux = aux;
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ BUG_ON(fp_old == NULL);
+
+ size = round_up(size, PAGE_SIZE);
+ if (size <= fp_old->pages * PAGE_SIZE)
+ return fp_old;
+
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+ fp->pages = size / PAGE_SIZE;
+
+ /* We keep fp->aux from fp_old around in the new
+ * reallocated structure.
+ */
+ fp_old->aux = NULL;
+ __bpf_prog_free(fp_old);
+ }
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+ kfree(fp->aux);
+ vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+#ifdef CONFIG_BPF_JIT
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *hdr;
+ unsigned int size, hole, start;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ hdr = module_alloc(size);
+ if (hdr == NULL)
+ return NULL;
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->pages = size / PAGE_SIZE;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (prandom_u32() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of instructions before BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+ module_free(NULL, hdr);
+}
+#endif /* CONFIG_BPF_JIT */
+
/* Base function for offset calculation. Needs to go into .text section,
* therefore keeping it non-static as well; will also be used by JITs
* anyway later on, so do not let the compiler omit it.
@@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
void *ptr;
int off;
@@ -239,6 +344,10 @@ select_insn:
ALU64_MOV_K:
DST = IMM;
CONT;
+ LD_IMM_DW:
+ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+ insn++;
+ CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
@@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
/* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
+ /* Lock whole bpf_prog as read-only */
+ bpf_prog_lock_ro(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
-/* free internal BPF program */
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
+}
+
+/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
- bpf_jit_free(fp);
+ struct bpf_prog_aux *aux = fp->aux;
+
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ aux->prog = fp;
+ schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+ int len)
+{
+ return -EFAULT;
+}
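
The LD_IMM_DW handler added to __bpf_prog_run() above assembles a 64-bit immediate from two consecutive instructions, taking the low 32 bits of each imm field. A small standalone demonstration of that composition, and of why the inner (u32) cast matters for negative imm values, is below (plain C, no kernel types; the imm values are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t imm_lo = -1;          /* plays the role of insn[0].imm */
	int32_t imm_hi = 0x12345678;  /* plays the role of insn[1].imm */

	/* same expression as the interpreter: mask each half to 32 bits
	 * before widening, otherwise imm_lo would sign-extend into the
	 * upper half of the destination register */
	uint64_t dst = (uint64_t)(uint32_t)imm_lo |
		       ((uint64_t)(uint32_t)imm_hi) << 32;
	printf("0x%016llx\n", (unsigned long long)dst);  /* 0x12345678ffffffff */

	/* without the inner cast the sign extension clobbers the high word */
	uint64_t bad = (uint64_t)imm_lo |
		       ((uint64_t)(uint32_t)imm_hi) << 32;
	printf("0x%016llx\n", (unsigned long long)bad);  /* 0xffffffffffffffff */
	return 0;
}
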
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-zero, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* itereated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_htab_map);
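
htab_map_alloc() above rounds the bucket count up to a power of two so that select_bucket() can mask the hash instead of taking a modulo. A quick standalone sketch of that bucket selection follows (plain C; roundup_pow_of_two_u32 is a userspace stand-in for the kernel helper, and the hash value stands in for jhash(key, key_size, 0)):

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the kernel's roundup_pow_of_two() */
static uint32_t roundup_pow_of_two_u32(uint32_t n)
{
	uint32_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	uint32_t max_entries = 1000;
	uint32_t n_buckets = roundup_pow_of_two_u32(max_entries); /* 1024 */
	uint32_t hash = 0x9e3779b9;  /* pretend jhash() output */

	/* select_bucket(): valid only because n_buckets is a power of two */
	uint32_t bucket = hash & (n_buckets - 1);

	printf("n_buckets=%u bucket=%u\n", n_buckets, bucket); /* 1024, 441 */
	return 0;
}
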
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..088ac0b1b106
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,606 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOENT;
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (!value)
+ goto err_unlock;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto err_unlock;
+
+ err = 0;
+
+err_unlock:
+ rcu_read_unlock();
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
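+
+/* User space sketch of the element commands (update and delete follow the
+ * same pattern with BPF_MAP_UPDATE_ELEM / BPF_MAP_DELETE_ELEM), assuming the
+ * 4-byte key / 8-byte value map from the creation example above:
+ *
+ *	__u32 key = 1;
+ *	__u64 value;
+ *	union bpf_attr attr = {
+ *		.map_fd = map_fd,
+ *		.key = (__u64) (unsigned long) &key,
+ *		.value = (__u64) (unsigned long) &value,
+ *	};
+ *
+ *	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
+ *		// 'value' now holds the element copied out above
+ */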
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+	/* eBPF programs that use maps run under rcu_read_lock(),
+	 * and all map accessors rely on that fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
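+
+/* User space can walk all keys of a map with this command; a sketch for a
+ * 4-byte key (error handling omitted; starting from a key that is not in the
+ * map is assumed to yield the first key, as the hash map backend does):
+ *
+ *	__u32 key = -1, next_key;
+ *	union bpf_attr attr = {
+ *		.map_fd = map_fd,
+ *		.key = (__u64) (unsigned long) &key,
+ *		.next_key = (__u64) (unsigned long) &next_key,
+ *	};
+ *
+ *	while (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)) == 0) {
+ *		// look up or print next_key here
+ *		key = next_key;
+ *	}
+ */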
+
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->aux->prog_type = type;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+			/* we reach here when the program has bpf_call instructions
+			 * and has passed bpf_check(), which means that
+			 * ops->get_func_proto must have been supplied, so check it
+			 */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+			/* all functions that have a prototype and that the
+			 * verifier allowed programs to call must be real
+			 * in-kernel functions
+			 */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
+/* drop refcnt on maps used by eBPF program and free auxiliary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ free_used_maps(prog->aux);
+ bpf_prog_free(prog);
+ }
+}
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching a program to an event;
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->aux->refcnt);
+ fdput(f);
+ return prog;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD log_buf
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->aux->is_gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ err = bpf_check(prog, attr);
+
+ if (err < 0)
+ goto free_used_maps;
+
+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
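+
+/* A minimal user space sketch of loading a program (insn macros as in the
+ * kernel's <linux/filter.h>; BPF_PROG_TYPE_SOCKET_FILTER is assumed to be
+ * registered by the networking code):
+ *
+ *	struct bpf_insn insns[] = {
+ *		BPF_MOV64_IMM(BPF_REG_0, 0),	// r0 = 0
+ *		BPF_EXIT_INSN(),		// return r0
+ *	};
+ *	union bpf_attr attr = {
+ *		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ *		.insns = (__u64) (unsigned long) insns,
+ *		.insn_cnt = 2,
+ *		.license = (__u64) (unsigned long) "GPL",
+ *	};
+ *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ *
+ * The returned fd is backed by bpf_prog_fops and pins the program until the
+ * last reference is dropped via bpf_prog_put().
+ */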
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+	/* the syscall is limited to root temporarily. This restriction will be
+	 * lifted once the security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, uattr, 1))
+ return -EFAULT;
+
+ if (size > PAGE_SIZE) /* silly large */
+ return -E2BIG;
+
+ /* If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+ */
+ if (size > sizeof(attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+ size = sizeof(attr);
+ }
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
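+
+/* There is typically no libc wrapper for this syscall, so user space is
+ * expected to invoke it directly; a thin wrapper might look like (sketch):
+ *
+ *	static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+ *	{
+ *		return syscall(__NR_bpf, cmd, attr, size);
+ *	}
+ *
+ * The explicit size argument is what lets old user space work on newer
+ * kernels and vice versa: any trailing bytes the kernel does not know about
+ * must simply be zero, as enforced above.
+ */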
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..0ceae1e6e8b5
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
+ return NULL;
+ }
+}
+
+static const struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+} test_ctx_access[] = {
+ [offsetof(struct bpf_context, arg1)] = {
+ FIELD_SIZEOF(struct bpf_context, arg1),
+ BPF_READ
+ },
+ [offsetof(struct bpf_context, arg2)] = {
+ FIELD_SIZEOF(struct bpf_context, arg2),
+ BPF_READ
+ },
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+ return false;
+
+ access = &test_ctx_access[off];
+ if (access->size == size && (access->type & type))
+ return true;
+
+ return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+ .get_func_proto = test_func_proto,
+ .is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+ .ops = &test_ops,
+ .type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static int __init register_test_ops(void)
+{
+ bpf_register_prog_type(&tl_prog);
+ return 0;
+}
+late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..a28e09c7825d
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,2003 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all paths through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if the total number
+ * of insns is less than 4K when there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null',
+ * the function expects the 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns either a pointer to the map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* register spilled into stack */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+	/* verifier state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
+ struct verifier_state_list **explored_states; /* search pruning optimization */
+	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ }
+ verbose("\n");
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers;
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so it's aligned access and [off, off + size) are within stack limits
+ */
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ /* save register state */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ state->regs[value_regno];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ } else {
+ /* regular write of data into stack */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ u8 *slot_type;
+ int i;
+
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (slot_type[i] != STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
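+
+/* Example of the spill/fill pattern tracked above (sketch of an eBPF insn
+ * sequence, macros as in <linux/filter.h>): a pointer register is saved to
+ * the stack and restored later, e.g.
+ *
+ *	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),	// spill R1 (PTR_TO_CTX)
+ *	...
+ *	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),	// fill it back into R2
+ *
+ * Only full BPF_REG_SIZE (8 byte) accesses are accepted for such spill slots;
+ * any other size hits the "invalid size of register spill" checks above.
+ */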
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register whose value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size, err = 0;
+
+ size = bpf_size_to_bytes(bpf_size);
+ if (size < 0)
+ return size;
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ err = check_map_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ err = check_ctx_access(env, off, size, t);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ err = check_stack_write(state, off, size, value_regno);
+ else
+ err = check_stack_read(state, off, size, value_regno);
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return err;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can read the memory */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can write into the same memory */
+ return check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+ int err = 0;
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno - 1, reg->imm);
+ }
+
+ return err;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i, err;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ if (err)
+ return err;
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+	/* detect if R == 0 where R is the value returned from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+			/* branch target cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
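+
+/* Example of the NULL check pattern handled above (sketch of an insn fragment,
+ * assuming R1/R2 were set up for the call and the map's value_size >= 8):
+ *
+ *	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ *	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),	// if (R0 == NULL) skip the store
+ *	BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),	// R0 is PTR_TO_MAP_VALUE here
+ *	BPF_EXIT_INSN(),
+ *
+ * In the fall-through branch R0 becomes PTR_TO_MAP_VALUE, so the store is
+ * allowed; in the taken branch R0 becomes CONST_IMM 0, exactly as set by
+ * check_cond_jmp_op() above.
+ */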
+
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
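+
+/* Example of the pattern this helper accepts (sketch, macros as in
+ * <linux/filter.h>): a socket filter that loads one byte of packet data,
+ * with R6 set up as the context first:
+ *
+ *	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),	// R6 = ctx (skb)
+ *	BPF_LD_ABS(BPF_B, 23),			// R0 = byte at skb offset 23
+ *	BPF_EXIT_INSN(),
+ *
+ * After the LD_ABS insn R1-R5 are scratched and R0 holds the loaded byte, as
+ * described in the comment above check_ld_abs().
+ */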
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
+static int *insn_stack; /* stack of insns to process */
+static int cur_stack; /* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return 0;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return 0;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose("jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH)
+ /* mark branch target for state pruning */
+ env->explored_states[w] = STATE_LIST_MARK;
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[cur_stack++] = w;
+ return 1;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose("back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verbose("insn state internal bug\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret = 0;
+ int i, t;
+
+ insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_stack) {
+ kfree(insn_state);
+ return -ENOMEM;
+ }
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ cur_stack = 1;
+
+peek_stack:
+ if (cur_stack == 0)
+ goto check_state;
+ t = insn_stack[cur_stack - 1];
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + insns[t].off + 1,
+ FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+ env->explored_states[t + 1] = STATE_LIST_MARK;
+ } else {
+ /* conditional jump with two edges */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+
+mark_explored:
+ insn_state[t] = EXPLORED;
+ if (cur_stack-- <= 0) {
+ verbose("pop stack internal bug\n");
+ ret = -EFAULT;
+ goto err_free;
+ }
+ goto peek_stack;
+
+check_state:
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn_state[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ }
+ ret = 0; /* cfg looks good */
+
+err_free:
+ kfree(insn_state);
+ kfree(insn_stack);
+ return ret;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (memcmp(&old->regs[i], &cur->regs[i],
+ sizeof(old->regs[0])) != 0) {
+ if (old->regs[i].type == NOT_INIT ||
+ (old->regs[i].type == UNKNOWN_VALUE &&
+ cur->regs[i].type != NOT_INIT))
+ continue;
+ return false;
+ }
+ }
+
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has STACK_MISC ->
+			 * these verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
+ }
+ return true;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *new_sl;
+ struct verifier_state_list *sl;
+
+ sl = env->explored_states[insn_idx];
+ if (!sl)
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+ return 0;
+
+ while (sl != STATE_LIST_MARK) {
+ if (states_equal(&sl->state, &env->cur_state))
+ /* reached equivalent register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+
+ /* there were no equivalent states, remember current one.
+ * technically the current state is not proven to be safe yet,
+ * but it will either reach bpf_exit (which means it's safe) or
+ * it will be rejected. Since there are no loops, we won't be
+ * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ */
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ if (!new_sl)
+ return -ENOMEM;
+
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->explored_states[insn_idx];
+ env->explored_states[insn_idx] = new_sl;
+ return 0;
+}
+
+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+ int err;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ err = is_state_visited(env, insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (log_level) {
+ if (do_print_state)
+ verbose("\nfrom %d to %d: safe\n",
+ prev_insn_idx, insn_idx);
+ else
+ verbose("%d: safe\n", insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(regs, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ err = check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ err = check_xadd(env, insn);
+ if (err)
+ return err;
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1);
+ if (err)
+ return err;
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_call(env, insn->imm);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* The eBPF calling convention is such that R0 is used
+ * to return the value from the eBPF program.
+ * Make sure that it is readable at the time of
+ * bpf_exit, which means that the program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+process_bpf_exit:
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ err = check_cond_jmp_op(env, insn, &insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
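The imm split performed above (insn[0].imm carries the low 32 bits of the map pointer, insn[1].imm the high 32 bits) round-trips as in this small user-space sketch; the struct below is a hypothetical stand-in for the two-instruction BPF_LD_IMM64 pair, not the kernel's struct bpf_insn.

	#include <stdint.h>
	#include <stdio.h>

	struct imm_pair {
		uint32_t imm_lo;	/* would live in insn[0].imm */
		uint32_t imm_hi;	/* would live in insn[1].imm */
	};

	static void store_ptr(struct imm_pair *p, uint64_t addr)
	{
		p->imm_lo = (uint32_t)addr;
		p->imm_hi = (uint32_t)(addr >> 32);
	}

	static uint64_t load_ptr(const struct imm_pair *p)
	{
		return ((uint64_t)p->imm_hi << 32) | p->imm_lo;
	}

	int main(void)
	{
		struct imm_pair p;
		uint64_t addr = 0x12345678abcdef00ULL;

		store_ptr(&p, addr);
		printf("round trip ok: %d\n", load_ptr(&p) == addr);
		return 0;
	}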
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
+static void free_states(struct verifier_env *env)
+{
+ struct verifier_state_list *sl, *sln;
+ int i;
+
+ if (!env->explored_states)
+ return;
+
+ for (i = 0; i < env->prog->len; i++) {
+ sl = env->explored_states[i];
+
+ if (sl)
+ while (sl != STATE_LIST_MARK) {
+ sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->explored_states);
+}
+
+int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
+ int ret = -EINVAL;
+
+ if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->prog = prog;
+
+ /* grab the mutex to protect a few globals used by the verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->explored_states = kcalloc(prog->len,
+ sizeof(struct verifier_state_list *),
+ GFP_USER);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ if (!prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
+ return ret;
+}
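From the syscall side, the log_* checks in bpf_check() mean a caller who wants the verification trace must pass a non-zero log_level, a buffer, and a size of at least 128 bytes. The following is only a hedged user-space sketch, assuming the bpf(2) uapi from this series (<linux/bpf.h> with BPF_PROG_TYPE_SOCKET_FILTER available) and a kernel/libc that define __NR_bpf:

	#include <linux/bpf.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		char log[4096] = "";
		/* hand-encoded "r0 = 0; exit" */
		struct bpf_insn prog[] = {
			{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
			{ .code = BPF_JMP | BPF_EXIT },
		};
		union bpf_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;	/* assumed available */
		attr.insns = (unsigned long)prog;
		attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
		attr.license = (unsigned long)"GPL";
		attr.log_level = 1;			/* must be non-zero ... */
		attr.log_buf = (unsigned long)log;	/* ... with a buffer ... */
		attr.log_size = sizeof(log);		/* ... of at least 128 bytes */

		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
		if (fd < 0)
			printf("verifier log:\n%s\n", log);
		return fd < 0;
	}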
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a73f995a81e..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -279,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!(cgrp->root->subsys_mask & (1 << ss->id)))
return NULL;
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Use ->child_subsys_mask.
+ */
while (cgroup_parent(cgrp) &&
!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
cgrp = cgroup_parent(cgrp);
@@ -286,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
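A minimal in-kernel usage sketch of cgroup_get_e_css(); the caller name and the choice of memory_cgrp_subsys are illustrative assumptions, the point being that the always-valid css it returns must be balanced with css_put():

	/* hypothetical helper, not part of this patch */
	static int example_effective_memory_css_id(struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css;
		int id;

		css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
		id = css->id;	/* safe: css is valid and referenced */
		css_put(css);
		return id;
	}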
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -331,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
return false;
}
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
- const int bits =
- (1 << CGRP_RELEASABLE) |
- (1 << CGRP_NOTIFY_ON_RELEASE);
- return (cgrp->flags & bits) == bits;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
@@ -498,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -524,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
- if (notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
}
kfree(link);
@@ -537,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
return;
down_write(&css_set_rwsem);
- put_css_set_locked(cset, taskexit);
+ put_css_set_locked(cset);
up_write(&css_set_rwsem);
}
@@ -969,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
@@ -1046,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
* @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
- * This function determines which subsystems need to be enabled given the
- * current @cgrp->subtree_control and records it in
- * @cgrp->child_subsys_mask. The resulting mask is always a superset of
- * @cgrp->subtree_control and follows the usual hierarchy rules.
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
*/
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned int subtree_control)
{
struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = cgrp->subtree_control;
+ unsigned int cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->child_subsys_mask = cur_ss_mask;
- return;
- }
+ if (!cgroup_on_dfl(cgrp))
+ return cur_ss_mask;
while (true) {
unsigned int new_ss_mask = cur_ss_mask;
@@ -1094,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
cur_ss_mask = new_ss_mask;
}
- cgrp->child_subsys_mask = cur_ss_mask;
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ cgrp->child_subsys_mask =
+ cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
}
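The dependency closure computed by cgroup_calc_child_subsys_mask() is a plain fixed-point iteration over bitmasks. A user-space sketch with hypothetical masks (it ignores the clamp against the parent's subtree_control that the real code also applies):

	#include <stdio.h>

	#define NSUBSYS 4

	static unsigned int calc_mask(unsigned int ctrl, const unsigned int *depends_on)
	{
		unsigned int cur = ctrl;

		for (;;) {
			unsigned int next = cur;
			int i;

			/* pull in everything the enabled subsystems depend on */
			for (i = 0; i < NSUBSYS; i++)
				if (cur & (1u << i))
					next |= depends_on[i];
			if (next == cur)
				return cur;
			cur = next;
		}
	}

	int main(void)
	{
		/* subsystem 0 depends on 1, subsystem 1 depends on 2 */
		unsigned int depends_on[NSUBSYS] = { 1u << 1, 1u << 2, 0, 0 };

		printf("0x%x\n", calc_mask(1u << 0, depends_on));	/* 0x7 */
		return 0;
	}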
/**
@@ -1587,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1597,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1634,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
goto out;
root_cgrp->id = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
if (ret)
goto out;
@@ -2052,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
- set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set_locked(old_cset, false);
+ put_css_set_locked(old_cset);
}
/**
@@ -2074,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset, false);
+ put_css_set_locked(cset);
}
up_write(&css_set_rwsem);
}
@@ -2168,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset, false);
- put_css_set(dst_cset, false);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
continue;
}
@@ -2178,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
- put_css_set(dst_cset, false);
+ put_css_set(dst_cset);
}
list_splice_tail(&csets, preloaded_csets);
@@ -2668,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+ unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2720,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
ret = -ENOENT;
goto out_unlock;
}
-
- /*
- * @ss is already enabled through dependency and
- * we'll just make it visible. Skip draining.
- */
- if (cgrp->child_subsys_mask & (1 << ssid))
- continue;
-
- /*
- * Because css offlining is asynchronous, userland
- * might try to re-enable the same controller while
- * the previous instance is still around. In such
- * cases, wait till it's gone using offline_waitq.
- */
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
} else if (disable & (1 << ssid)) {
if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
@@ -2785,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* subsystems than specified may need to be enabled or disabled
* depending on subsystem dependencies.
*/
- cgrp->subtree_control |= enable;
- cgrp->subtree_control &= ~disable;
-
- old_ctrl = cgrp->child_subsys_mask;
- cgroup_refresh_child_subsys_mask(cgrp);
- new_ctrl = cgrp->child_subsys_mask;
+ old_sc = cgrp->subtree_control;
+ old_ss = cgrp->child_subsys_mask;
+ new_sc = (old_sc | enable) & ~disable;
+ new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
- css_enable = ~old_ctrl & new_ctrl;
- css_disable = old_ctrl & ~new_ctrl;
+ css_enable = ~old_ss & new_ss;
+ css_disable = old_ss & ~new_ss;
enable |= css_enable;
disable |= css_disable;
/*
+ * Because css offlining is asynchronous, userland might try to
+ * re-enable the same controller while the previous instance is
+ * still around. In such cases, wait till it's gone using
+ * offline_waitq.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(css_enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+ }
+
+ cgrp->subtree_control = new_sc;
+ cgrp->child_subsys_mask = new_ss;
+
+ /*
* Create new csses or make the existing ones visible. A css is
* created invisible if it's being implicitly enabled through
* dependency. An invisible css is made visible when the userland
@@ -2852,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
}
+ /*
+ * The effective csses of all the descendants (excluding @cgrp) may
+ * have changed. Subsystems can optionally subscribe to this event
+ * by implementing ->css_e_css_changed() which is invoked if any of
+ * the effective csses seen from the css's cgroup may have changed.
+ */
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+ struct cgroup_subsys_state *css;
+
+ if (!ss->css_e_css_changed || !this_css)
+ continue;
+
+ css_for_each_descendant_pre(css, this_css)
+ if (css != this_css)
+ ss->css_e_css_changed(css);
+ }
+
kernfs_activate(cgrp->kn);
ret = 0;
out_unlock:
@@ -2859,9 +2898,8 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->subtree_control &= ~enable;
- cgrp->subtree_control |= disable;
- cgroup_refresh_child_subsys_mask(cgrp);
+ cgrp->subtree_control = old_sc;
+ cgrp->child_subsys_mask = old_ss;
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -4173,7 +4211,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
@@ -4351,6 +4388,7 @@ static void css_free_work_fn(struct work_struct *work)
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
@@ -4397,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
cgroup_idr_remove(&ss->css_idr, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
} else {
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
@@ -4510,7 +4550,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
init_and_link_css(css, ss, cgrp);
- err = percpu_ref_init(&css->refcnt, css_release);
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
@@ -4583,7 +4623,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
goto out_unlock;
}
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
@@ -4813,19 +4853,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /* CSS_ONLINE is clear, remove from ->release_list for the last time */
- raw_spin_lock(&release_list_lock);
- if (!list_empty(&cgrp->release_list))
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
-
/*
* Remove @cgrp directory along with the base files. @cgrp has an
* extra ref on its kn.
*/
kernfs_remove(cgrp->kn);
- set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
check_for_release(cgroup_parent(cgrp));
/* put the base reference */
@@ -4842,13 +4875,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
- cgroup_get(cgrp); /* for @kn->priv clearing */
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
-
- cgroup_put(cgrp);
return ret;
}
@@ -5052,12 +5082,9 @@ core_initcall(cgroup_wq_init);
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *path;
int retval;
struct cgroup_root *root;
@@ -5067,14 +5094,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
- retval = 0;
-
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
@@ -5104,11 +5123,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
}
+ retval = 0;
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- put_task_struct(tsk);
-out_free:
kfree(buf);
out:
return retval;
@@ -5179,7 +5197,7 @@ void cgroup_post_fork(struct task_struct *child)
int i;
/*
- * This may race against cgroup_enable_task_cg_links(). As that
+ * This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
@@ -5194,7 +5212,7 @@ void cgroup_post_fork(struct task_struct *child)
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
- * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
@@ -5278,30 +5296,14 @@ void cgroup_exit(struct task_struct *tsk)
}
if (put_cset)
- put_css_set(cset, true);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
- !css_has_online_children(&cgrp->self)) {
- /*
- * Control Group is currently removeable. If it's not
- * already queued for a userspace notification, queue
- * it now
- */
- int need_schedule_work = 0;
-
- raw_spin_lock(&release_list_lock);
- if (!cgroup_is_dead(cgrp) &&
- list_empty(&cgrp->release_list)) {
- list_add(&cgrp->release_list, &release_list);
- need_schedule_work = 1;
- }
- raw_spin_unlock(&release_list_lock);
- if (need_schedule_work)
- schedule_work(&release_agent_work);
- }
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
}
/*
@@ -5329,52 +5331,39 @@ static void check_for_release(struct cgroup *cgrp)
*/
static void cgroup_release_agent(struct work_struct *work)
{
- BUG_ON(work != &release_agent_work);
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
- raw_spin_lock(&release_list_lock);
- while (!list_empty(&release_list)) {
- char *argv[3], *envp[3];
- int i;
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- struct cgroup *cgrp = list_entry(release_list.next,
- struct cgroup,
- release_list);
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathbuf)
- goto continue_free;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto continue_free;
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!agentbuf)
- goto continue_free;
-
- i = 0;
- argv[i++] = agentbuf;
- argv[i++] = path;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- /* Drop the lock while we invoke the usermode helper,
- * since the exec could involve hitting disk and hence
- * be a slow process */
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- mutex_lock(&cgroup_mutex);
- continue_free:
- kfree(pathbuf);
- kfree(agentbuf);
- raw_spin_lock(&release_list_lock);
- }
- raw_spin_unlock(&release_list_lock);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
}
static int __init cgroup_disable(char *str)
@@ -5562,7 +5551,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_KERNEL_XZ=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
- enum ctx_state prev_ctx;
-
- if (likely(!preemptible()))
- return;
-
- /*
- * Need to disable preemption in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- preempt_disable_notrace();
- prev_ctx = exception_enter();
- preempt_enable_no_resched_notrace();
-
- preempt_schedule();
-
- preempt_disable_notrace();
- exception_exit(prev_ctx);
- preempt_enable_notrace();
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
-
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 81e2a388a0f6..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+ /* And allows lockless put_online_cpus(). */
+ atomic_t puts_pending;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -79,9 +81,21 @@ static struct {
/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire_tryread() \
+ lock_map_acquire_tryread(&cpu_hotplug.dep_map)
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+static void apply_puts_pending(int max)
+{
+ int delta;
+
+ if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
+ delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
+ cpu_hotplug.refcount -= delta;
+ }
+}
+
void get_online_cpus(void)
{
might_sleep();
@@ -89,17 +103,35 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
+ apply_puts_pending(65536);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
-
}
EXPORT_SYMBOL_GPL(get_online_cpus);
+bool try_get_online_cpus(void)
+{
+ if (cpu_hotplug.active_writer == current)
+ return true;
+ if (!mutex_trylock(&cpu_hotplug.lock))
+ return false;
+ cpuhp_lock_acquire_tryread();
+ apply_puts_pending(65536);
+ cpu_hotplug.refcount++;
+ mutex_unlock(&cpu_hotplug.lock);
+ return true;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
+
void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
+ if (!mutex_trylock(&cpu_hotplug.lock)) {
+ atomic_inc(&cpu_hotplug.puts_pending);
+ cpuhp_lock_release();
+ return;
+ }
if (WARN_ON(!cpu_hotplug.refcount))
cpu_hotplug.refcount++; /* try to fix things up */
@@ -141,6 +173,7 @@ void cpu_hotplug_begin(void)
cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
+ apply_puts_pending(1);
if (likely(!cpu_hotplug.refcount))
break;
__set_current_state(TASK_UNINTERRUPTIBLE);
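The puts_pending scheme can be modelled in a few lines of user-space C (names hypothetical): a put that cannot take the mutex only bumps an atomic counter, and whoever next holds the mutex folds the pending puts back into the refcount.

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int puts_pending;
	static int refcount;	/* protected by the (elided) hotplug mutex */

	static void apply_puts_pending(int max)
	{
		if (atomic_load(&puts_pending) >= max)
			refcount -= atomic_exchange(&puts_pending, 0);
	}

	int main(void)
	{
		refcount = 3;
		atomic_fetch_add(&puts_pending, 2);	/* two lockless "puts" */
		apply_puts_pending(1);			/* done with the mutex held */
		printf("refcount = %d\n", refcount);	/* 1 */
		return 0;
	}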
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 52cb04c993b7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
*
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
*/
static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
/*
* CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
goto out;
}
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) &&
+ !cpuset_cpumask_can_shrink(cur->cpus_allowed,
+ trial->cpus_allowed))
+ goto out;
+
ret = 0;
out:
rcu_read_unlock();
@@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
goto out_unlock;
cgroup_taskset_for_each(task, tset) {
- /*
- * Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
- * affinity and isolating such threads by their set of
- * allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for
- * success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
- */
- ret = -EINVAL;
- if (task->flags & PF_NO_SETAFFINITY)
+ ret = task_can_attach(task, cs->cpus_allowed);
+ if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
@@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
count = seq_get_buf(sf, &buf);
s = buf;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
@@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
seq_commit(sf, -1);
}
out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
return ret;
}
@@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(cs->css.cgroup)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(root_css->cgroup)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}
@@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
@@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- mutex_lock(&callback_mutex);
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
+ unsigned long flags;
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
* (in get_page_from_freelist()) refusing to consider the zones for
@@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
*/
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- return 0;
-}
-
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
@@ -2730,10 +2683,9 @@ void __cpuset_memory_pressure_bump(void)
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
@@ -2743,24 +2695,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
if (!p)
- goto out_put_task;
+ goto out_free;
seq_puts(m, p);
seq_putc(m, '\n');
retval = 0;
-out_put_task:
- put_task_struct(tsk);
out_free:
kfree(buf);
out:
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
* it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
*/
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+EXPORT_SYMBOL_GPL(elfcorehdr_addr);
/*
* stores the size of elf header of crash image
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
* version 2. This program is licensed "as is" without any warranty of any
* kind, whether express or implied.
*/
+
+#define pr_fmt(fmt) "KGDB: " fmt
+
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
#include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
return err;
err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
- printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
- "memory destroyed at: %lx", addr);
+ pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
+ addr);
return err;
}
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx",
- kgdb_break[i].bpt_addr);
+ pr_info("BP install failed: %lx\n",
+ kgdb_break[i].bpt_addr);
continue;
}
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
continue;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
- kgdb_break[i].bpt_addr);
+ pr_info("BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
goto setundefined;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
- printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ pr_err("breakpoint remove failed: %lx\n",
kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
if (print_wait) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+ pr_crit("waiting... or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+ pr_crit("Waiting for remote debugger\n");
#endif
}
return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
exception_level = 0;
kgdb_skipexception(ks->ex_vector, ks->linux_regs);
dbg_activate_sw_breakpoints();
- printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
- addr);
+ pr_crit("re-enter error: breakpoint removed %lx\n", addr);
WARN_ON_ONCE(1);
return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
panic("Recursive entry to debugger");
}
- printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+ pr_crit("re-enter exception: ALL breakpoints killed\n");
#ifdef CONFIG_KGDB_KDB
/* Allow kdb to debug itself one level */
return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
static void sysrq_handle_dbg(int key)
{
if (!dbg_io_ops) {
- printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ pr_crit("ERROR: No KGDB I/O module available\n");
return;
}
if (!kgdb_connected) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+ pr_crit("KGDB or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "Entering KGDB\n");
+ pr_crit("Entering KGDB\n");
#endif
}
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
{
kgdb_break_asap = 0;
- printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ pr_crit("Waiting for connection from remote gdb...\n");
kgdb_breakpoint();
}
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
if (dbg_io_ops) {
spin_unlock(&kgdb_registration_lock);
- printk(KERN_ERR "kgdb: Another I/O driver is already "
- "registered with KGDB.\n");
+ pr_err("Another I/O driver is already registered with KGDB\n");
return -EBUSY;
}
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
- new_dbg_io_ops->name);
+ pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO
- "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
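The printk conversions above work because a pr_fmt() definition placed before the printk headers are pulled in is expanded into every pr_*() call in that file, so the per-message "KGDB: " literals become redundant. A minimal sketch of the idiom is shown below; "sampledrv" is a made-up module name used only for illustration.

/* Illustrative sketch only; "sampledrv" is an invented name. */
#define pr_fmt(fmt) "sampledrv: " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>

static int __init sampledrv_init(void)
{
        pr_info("loaded\n");                    /* logged as "sampledrv: loaded" */
        pr_err("example error %d\n", -5);       /* "sampledrv: example error -5" */
        return 0;
}
module_init(sampledrv_init);

static void __exit sampledrv_exit(void)
{
        pr_info("unloaded\n");
}
module_exit(sampledrv_exit);

MODULE_LICENSE("GPL");

Defining the macro before any include is what makes the prefix apply uniformly, which is exactly the pattern the debug_core.c hunk adopts.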
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 70a504601dc3..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
bp->bph_length = 1;
if ((argc + 1) != nextarg) {
- if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
bp->bp_type = BP_ACCESS_WATCHPOINT;
- else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
bp->bp_type = BP_WRITE_WATCHPOINT;
- else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
bp->bp_type = BP_HARDWARE_BREAKPOINT;
else
return KDB_ARGCOUNT;
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0, KDB_REPEAT_NONE);
-
- kdb_register_repeat("ss", kdb_ss, "",
- "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+
+ kdb_register_flags("ss", kdb_ss, "",
+ "Single Step", 1,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
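Note that the strnicmp() to strncasecmp() conversion above keeps the sizeof("datar")-style length arguments; because sizeof() counts the terminating NUL, the comparison only matches the exact token (case-insensitively) and rejects longer strings that merely share the prefix. A small userspace illustration of that idiom, not part of the patch:

#include <stdio.h>
#include <strings.h>    /* strncasecmp() */

int main(void)
{
        /* sizeof("datar") == 6: the five letters plus the NUL terminator */
        printf("%d\n", strncasecmp("DATAR",  "datar", sizeof("datar")) == 0); /* prints 1: exact match */
        printf("%d\n", strncasecmp("datarw", "datar", sizeof("datar")) == 0); /* prints 0: longer token rejected */
        return 0;
}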
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..f191bddf64b8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
*/
#include <linux/ctype.h>
+#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
#include <linux/slab.h>
#include "kdb_private.h"
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "kdb."
+
+static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
+module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
+
#define GREP_LEN 256
char kdb_grep_string[GREP_LEN];
int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
KDBMSG(BADLENGTH, "Invalid length field"),
KDBMSG(NOBP, "No Breakpoint exists"),
KDBMSG(BADADDR, "Invalid address"),
+ KDBMSG(NOPERM, "Permission denied"),
};
#undef KDBMSG
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
+ * Check whether the flags of the current command and the permissions
+ * of the kdb console allow a command to be run.
+ */
+static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+ bool no_args)
+{
+ /* permissions comes from userspace so needs massaging slightly */
+ permissions &= KDB_ENABLE_MASK;
+ permissions |= KDB_ENABLE_ALWAYS_SAFE;
+
+ /* some commands change group when launched with no arguments */
+ if (no_args)
+ permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
+
+ flags |= KDB_ENABLE_ALL;
+
+ return permissions & flags;
+}
+
+/*
* kdbgetenv - This function will return the character string value of
* an environment variable.
* Parameters:
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
kdb_symtab_t symtab;
/*
+ * If the enable flags prohibit both arbitrary memory access
+ * and flow control then there are no reasonable grounds to
+ * provide symbol lookup.
+ */
+ if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
+ kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
* Process arguments which follow the following syntax:
*
* symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
if (!s->count)
s->usable = 0;
if (s->usable)
- kdb_register(s->name, kdb_exec_defcmd,
- s->usage, s->help, 0);
+ /* macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is
+ * safety checked individually.
+ */
+ kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
+ s->help, 0,
+ KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
if (i < kdb_max_commands) {
int result;
+
+ if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ return KDB_NOPERM;
+
KDB_STATE_SET(CMD);
result = (*tp->cmd_func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- switch (tp->cmd_repeat) {
- case KDB_REPEAT_NONE:
- argc = 0;
- if (argv[0])
- *(argv[0]) = '\0';
- break;
- case KDB_REPEAT_NO_ARGS:
- argc = 1;
- if (argv[1])
- *(argv[1]) = '\0';
- break;
- case KDB_REPEAT_WITH_ARGS:
- break;
- }
+
+ if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ return result;
+
+ argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ if (argv[argc])
+ *(argv[argc]) = '\0';
return result;
}
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
*/
static int kdb_sr(int argc, const char **argv)
{
+ bool check_mask =
+ !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
+
if (argc != 1)
return KDB_ARGCOUNT;
+
kdb_trap_printk++;
- __handle_sysrq(*argv[1], false);
+ __handle_sysrq(*argv[1], check_mask);
kdb_trap_printk--;
return 0;
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
return 0;
if (!kt->cmd_name)
continue;
+ if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ continue;
if (strlen(kt->cmd_usage) > 20)
space = "\n ";
kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
}
/*
- * kdb_register_repeat - This function is used to register a kernel
+ * kdb_register_flags - This function is used to register a kernel
* debugger command.
* Inputs:
* cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
* zero for success, one if a duplicate command.
*/
#define kdb_command_extend 50 /* arbitrary */
-int kdb_register_repeat(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_repeat_t repeat)
+int kdb_register_flags(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_cmdflags_t flags)
{
int i;
kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
kp->cmd_func = func;
kp->cmd_usage = usage;
kp->cmd_help = help;
- kp->cmd_flags = 0;
kp->cmd_minlen = minlen;
- kp->cmd_repeat = repeat;
+ kp->cmd_flags = flags;
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_repeat);
+EXPORT_SYMBOL_GPL(kdb_register_flags);
/*
* kdb_register - Compatibility register function for commands that do
* not need to specify a repeat state. Equivalent to
- * kdb_register_repeat with KDB_REPEAT_NONE.
+ * kdb_register_flags with flags set to 0.
* Inputs:
* cmd Command name
* func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
char *help,
short minlen)
{
- return kdb_register_repeat(cmd, func, usage, help, minlen,
- KDB_REPEAT_NONE);
+ return kdb_register_flags(cmd, func, usage, help, minlen, 0);
}
EXPORT_SYMBOL_GPL(kdb_register);
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
for_each_kdbcmd(kp, i)
kp->cmd_name = NULL;
- kdb_register_repeat("md", kdb_md, "<vaddr>",
+ kdb_register_flags("md", kdb_md, "<vaddr>",
"Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("rd", kdb_rd, "",
- "Display Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0,
+ KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1,
+ KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("rd", kdb_rd, "",
+ "Display Registers", 0,
+ KDB_ENABLE_REG_READ);
+ kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0,
+ KDB_ENABLE_REG_WRITE);
+ kdb_register_flags("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1,
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
- KDB_REPEAT_NONE);
- kdb_register_repeat("env", kdb_env, "",
- "Show environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("set", kdb_set, "",
- "Set environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("help", kdb_help, "",
- "Display Help Message", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("?", kdb_help, "",
- "Display Help Message", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("env", kdb_env, "",
+ "Show environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("set", kdb_set, "",
+ "Set environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("help", kdb_help, "",
+ "Display Help Message", 1,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("?", kdb_help, "",
+ "Display Help Message", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0,
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, 0);
+ kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0,
+ KDB_ENABLE_REBOOT);
#if defined(CONFIG_MODULES)
- kdb_register_repeat("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0,
+ KDB_ENABLE_INSPECT);
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_repeat("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
if (arch_kgdb_ops.enable_nmi) {
- kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
- }
- kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("summary", kdb_summary, "",
- "Summarize the system", 4, KDB_REPEAT_NONE);
- kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3, KDB_REPEAT_NONE);
- kdb_register_repeat("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ }
+ kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0,
+ KDB_ENABLE_SIGNAL);
+ kdb_register_flags("summary", kdb_summary, "",
+ "Summarize the system", 4,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
}
/* Execute any commands defined in kdb_cmds. */
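The kdb_register_flags() calls above tag every command with the capability bits that kdb_check_flags() later tests against the kdb.cmd_enable module parameter. The standalone program below reproduces that mask arithmetic; the bit positions and names are invented for illustration and are not the values used by the kernel headers.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical bit layout, for illustration only. */
#define ENABLE_ALL              (1u << 0)       /* blanket "allow everything" bit  */
#define ENABLE_MEM_READ         (1u << 1)       /* command reads kernel memory     */
#define ENABLE_FLOW_CTRL        (1u << 2)       /* command controls execution flow */
#define ENABLE_ALWAYS_SAFE      (1u << 3)       /* granted unconditionally         */
#define ENABLE_MASK             ((1u << 4) - 1)
#define ENABLE_NO_ARGS_SHIFT    4

static bool check_flags(unsigned int cmd_flags, unsigned int permissions,
                        bool no_args)
{
        /* user-supplied permissions are clamped, always-safe is implied */
        permissions &= ENABLE_MASK;
        permissions |= ENABLE_ALWAYS_SAFE;

        /* some commands fall into a more permissive group when run bare */
        if (no_args)
                permissions |= permissions << ENABLE_NO_ARGS_SHIFT;

        /* every command may also be unlocked by the blanket ALL bit */
        cmd_flags |= ENABLE_ALL;

        return permissions & cmd_flags;
}

int main(void)
{
        /* an "md"-like command that needs MEM_READ */
        printf("%d\n", check_flags(ENABLE_MEM_READ, 0, false));                 /* 0: denied */
        printf("%d\n", check_flags(ENABLE_MEM_READ, ENABLE_MEM_READ, false));   /* 1: category granted */
        printf("%d\n", check_flags(ENABLE_MEM_READ, ENABLE_ALL, false));        /* 1: blanket grant */
        /* a "help"-like command flagged always-safe passes regardless */
        printf("%d\n", check_flags(ENABLE_ALWAYS_SAFE, 0, false));              /* 1 */
        return 0;
}

The OR of the ALL bit into the command flags is what lets a single user-visible "enable everything" bit unlock commands whose own flags only name a specific category.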
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
kdb_func_t cmd_func; /* Function to execute command */
char *cmd_usage; /* Usage String for this command */
char *cmd_help; /* Help message for this command */
- short cmd_flags; /* Parsing flags */
short cmd_minlen; /* Minimum legal # command
* chars required */
- kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+ kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
} kdbtab_t;
extern int kdb_bt(int, const char **); /* KDB display back trace */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df8fbfe..d659487254d5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@ static void release_callchain_buffers(void)
struct callchain_cpus_entries *entries;
entries = callchain_cpus_entries;
- rcu_assign_pointer(callchain_cpus_entries, NULL);
+ RCU_INIT_POINTER(callchain_cpus_entries, NULL);
call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}
@@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
int cpu;
struct callchain_cpus_entries *entries;
- *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
if (*rctx == -1)
return NULL;
@@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
static void
put_callchain_entry(int rctx)
{
- put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+ put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
struct perf_callchain_entry *
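Replacing rcu_assign_pointer() with RCU_INIT_POINTER() is safe in the hunk above because publishing NULL needs no ordering: a reader that observes NULL has nothing to dereference, so no memory barrier is required before the store. A self-contained sketch of the same teardown pattern, using invented names (demo_buf, demo_ptr, demo_lock):

/* Illustrative sketch only; the "demo" names are invented. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_buf {
        struct rcu_head rcu;
        int data[64];
};

static struct demo_buf __rcu *demo_ptr;
static DEFINE_SPINLOCK(demo_lock);

/* reader side: no locking, just an RCU read-side critical section */
static int demo_read_first(void)
{
        struct demo_buf *buf;
        int val = -1;

        rcu_read_lock();
        buf = rcu_dereference(demo_ptr);
        if (buf)
                val = buf->data[0];
        rcu_read_unlock();
        return val;
}

/*
 * writer side: clearing the pointer needs no publish barrier, so
 * RCU_INIT_POINTER() is enough; freeing waits for readers via kfree_rcu().
 */
static void demo_release(void)
{
        struct demo_buf *buf;

        spin_lock(&demo_lock);
        buf = rcu_dereference_protected(demo_ptr, lockdep_is_held(&demo_lock));
        RCU_INIT_POINTER(demo_ptr, NULL);
        spin_unlock(&demo_lock);

        if (buf)
                kfree_rcu(buf, rcu);
}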
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 963bf139e2b2..882f835a0d85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,8 @@
#include <asm/irq_regs.h>
+static struct workqueue_struct *perf_wq;
+
struct remote_function_call {
struct task_struct *p;
int (*func)(void *info);
@@ -120,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
return data.ret;
}
+#define EVENT_OWNER_KERNEL ((void *) -1)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+ return event->owner == EVENT_OWNER_KERNEL;
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -240,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w)
u64 avg_local_sample_len;
u64 local_samples_len;
- local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len = __this_cpu_read(running_sample_length);
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
printk_ratelimited(KERN_WARNING
@@ -262,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns)
return;
/* decay the counter by 1 average sample */
- local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len = __this_cpu_read(running_sample_length);
local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
local_samples_len += sample_len_ns;
- __get_cpu_var(running_sample_length) = local_samples_len;
+ __this_cpu_write(running_sample_length, local_samples_len);
/*
* note: this will be biased artificially low until we have
@@ -392,14 +401,9 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline void perf_put_cgroup(struct perf_event *event)
-{
- css_put(&event->cgrp->css);
-}
-
static inline void perf_detach_cgroup(struct perf_event *event)
{
- perf_put_cgroup(event);
+ css_put(&event->cgrp->css);
event->cgrp = NULL;
}
@@ -610,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_dentry,
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
@@ -878,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list);
static void perf_pmu_rotate_start(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- struct list_head *head = &__get_cpu_var(rotation_list);
+ struct list_head *head = this_cpu_ptr(&rotation_list);
WARN_ON(!irqs_disabled());
@@ -902,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx)
}
}
-static void unclone_ctx(struct perf_event_context *ctx)
+/*
+ * This must be done under ctx->lock so as to serialize against
+ * context_equiv(); we cannot call put_ctx() here since that might end up
+ * taking scheduler-related locks, and ctx->lock nests inside those.
+ */
+static __must_check struct perf_event_context *
+unclone_ctx(struct perf_event_context *ctx)
{
- if (ctx->parent_ctx) {
- put_ctx(ctx->parent_ctx);
+ struct perf_event_context *parent_ctx = ctx->parent_ctx;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (parent_ctx)
ctx->parent_ctx = NULL;
- }
ctx->generation++;
+
+ return parent_ctx;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1375,6 +1389,45 @@ out:
perf_event__header_size(tmp);
}
+/*
+ * A user event without an owner task.
+ */
+static bool is_orphaned_event(struct perf_event *event)
+{
+ return event && !is_kernel_event(event) && !event->owner;
+}
+
+/*
+ * Event has a parent but parent's task finished and it's
+ * alive only because of children holding a reference.
+ */
+static bool is_orphaned_child(struct perf_event *event)
+{
+ return is_orphaned_event(event->parent);
+}
+
+static void orphans_remove_work(struct work_struct *work);
+
+static void schedule_orphans_remove(struct perf_event_context *ctx)
+{
+ if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+ return;
+
+ if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
+ get_ctx(ctx);
+ ctx->orphans_remove_sched = true;
+ }
+}
+
+static int __init perf_workqueue_init(void)
+{
+ perf_wq = create_singlethread_workqueue("perf");
+ WARN(!perf_wq, "failed to create perf workqueue\n");
+ return perf_wq ? 0 : -1;
+}
+
+core_initcall(perf_workqueue_init);
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -1424,6 +1477,9 @@ event_sched_out(struct perf_event *event,
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
perf_pmu_enable(event->pmu);
}
@@ -1506,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
if (!task) {
/*
- * Per cpu events are removed via an smp call and
- * the removal is always successful.
+ * Per cpu events are removed via an smp call. The removal can
+ * fail if the CPU is currently offline, but in that case we
+ * already called __perf_remove_from_context from
+ * perf_event_exit_cpu.
*/
cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
@@ -1731,6 +1789,9 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
out:
perf_pmu_enable(event->pmu);
@@ -2210,6 +2271,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
+ lockdep_assert_held(&ctx1->lock);
+ lockdep_assert_held(&ctx2->lock);
+
/* Pinning disables the swap optimization */
if (ctx1->pin_count || ctx2->pin_count)
return 0;
@@ -2331,7 +2395,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
- if (!parent || !next_parent)
+ if (!parent && !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2400,7 +2464,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_out(task, next);
}
@@ -2643,11 +2707,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
* to check if we have to switch in PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
/* check for system-wide branch_stack events */
- if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
perf_branch_stack_sched_in(prev, task);
}
@@ -2902,7 +2966,7 @@ bool perf_event_can_stop_tick(void)
void perf_event_task_tick(void)
{
- struct list_head *head = &__get_cpu_var(rotation_list);
+ struct list_head *head = this_cpu_ptr(&rotation_list);
struct perf_cpu_context *cpuctx, *tmp;
struct perf_event_context *ctx;
int throttled;
@@ -2943,6 +3007,7 @@ static int event_enable_on_exec(struct perf_event *event,
*/
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
@@ -2974,7 +3039,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
* Unclone this context if we enabled any event.
*/
if (enabled)
- unclone_ctx(ctx);
+ clone_ctx = unclone_ctx(ctx);
raw_spin_unlock(&ctx->lock);
@@ -2984,6 +3049,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
}
void perf_event_exec(void)
@@ -3078,6 +3146,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
+ INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3135,7 +3204,7 @@ errout:
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
{
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
unsigned long flags;
int ctxn, err;
@@ -3169,9 +3238,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
- unclone_ctx(ctx);
+ clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
} else {
ctx = alloc_perf_context(pmu, task);
err = -ENOMEM;
@@ -3323,16 +3395,12 @@ static void free_event(struct perf_event *event)
}
/*
- * Called when the last reference to the file is gone.
+ * Remove user event from the owner task.
*/
-static void put_event(struct perf_event *event)
+static void perf_remove_from_owner(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
- if (!atomic_long_dec_and_test(&event->refcount))
- return;
-
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
/*
@@ -3365,6 +3433,20 @@ static void put_event(struct perf_event *event)
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static void put_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
WARN_ON_ONCE(ctx->parent_ctx);
/*
@@ -3399,6 +3481,42 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
+/*
+ * Remove all orphaned events from the context.
+ */
+static void orphans_remove_work(struct work_struct *work)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+
+ ctx = container_of(work, struct perf_event_context,
+ orphans_remove.work);
+
+ mutex_lock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ struct perf_event *parent_event = event->parent;
+
+ if (!is_orphaned_child(event))
+ continue;
+
+ perf_remove_from_context(event, true);
+
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ free_event(event);
+ put_event(parent_event);
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ ctx->orphans_remove_sched = false;
+ raw_spin_unlock_irq(&ctx->lock);
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
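The orphan cleanup above hangs a delayed work item off the context, queues it on a private single-threaded workqueue, and pins the context with an extra reference for as long as the work is pending. Below is a stripped-down sketch of that ownership pattern with invented names (reaper_*); the locking that the real code performs under ctx->lock is elided and only noted in comments.

/* Illustrative sketch only; the "reaper" names are invented. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reaper_wq;

struct reaper_ctx {
        struct kref ref;
        struct delayed_work reap_work;
        bool reap_scheduled;            /* serialized by a lock in real code */
};

static void reaper_ctx_release(struct kref *ref)
{
        kfree(container_of(ref, struct reaper_ctx, ref));
}

static void reap_work_fn(struct work_struct *work)
{
        struct reaper_ctx *ctx = container_of(work, struct reaper_ctx,
                                              reap_work.work);

        /* ... walk the context and release stale objects here ... */

        ctx->reap_scheduled = false;
        kref_put(&ctx->ref, reaper_ctx_release);        /* drop the queue's reference */
}

static struct reaper_ctx *reaper_ctx_alloc(void)
{
        struct reaper_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return NULL;
        kref_init(&ctx->ref);
        INIT_DELAYED_WORK(&ctx->reap_work, reap_work_fn);
        return ctx;
}

static void schedule_reap(struct reaper_ctx *ctx)
{
        if (ctx->reap_scheduled || !reaper_wq)
                return;

        if (queue_delayed_work(reaper_wq, &ctx->reap_work, 1)) {
                kref_get(&ctx->ref);            /* pending work pins the ctx */
                ctx->reap_scheduled = true;
        }
}

static int __init reaper_init(void)
{
        reaper_wq = create_singlethread_workqueue("reaper");
        if (!reaper_wq)
                return -ENOMEM;
        return 0;
}
core_initcall(reaper_init);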
@@ -3496,6 +3614,19 @@ static int perf_event_read_one(struct perf_event *event,
return n * sizeof(u64);
}
+static bool is_event_hup(struct perf_event *event)
+{
+ bool no_children;
+
+ if (event->state != PERF_EVENT_STATE_EXIT)
+ return false;
+
+ mutex_lock(&event->child_mutex);
+ no_children = list_empty(&event->child_list);
+ mutex_unlock(&event->child_mutex);
+ return no_children;
+}
+
/*
* Read the performance event - simple non blocking version for now
*/
@@ -3537,7 +3668,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
- unsigned int events = POLL_HUP;
+ unsigned int events = POLLHUP;
+
+ poll_wait(file, &event->waitq, wait);
+
+ if (is_event_hup(event))
+ return events;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3548,9 +3684,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
if (rb)
events = atomic_xchg(&rb->poll, 0);
mutex_unlock(&event->mmap_mutex);
-
- poll_wait(file, &event->waitq, wait);
-
return events;
}
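Two fixes are combined in the perf_poll() hunks above: POLL_HUP (a sigpoll si_code constant rather than a poll event bit) is replaced by POLLHUP, and poll_wait() is moved ahead of the early return so the caller is always registered on the wait queue before ->poll() bails out; returning without registering would mean the sleeper is never woken. A minimal file-operations sketch of that ordering, with invented names (demo_*):

/* Illustrative sketch only; the "demo" names are invented. */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/wait.h>

struct demo_dev {
        wait_queue_head_t waitq;
        bool dead;
        bool has_data;
};

static unsigned int demo_poll(struct file *file, poll_table *wait)
{
        struct demo_dev *dev = file->private_data;
        unsigned int events = 0;

        /* always register on the wait queue first, even if we bail out early */
        poll_wait(file, &dev->waitq, wait);

        if (dev->dead)
                return POLLHUP;

        if (dev->has_data)
                events |= POLLIN | POLLRDNORM;

        return events;
}

static const struct file_operations demo_fops = {
        .owner = THIS_MODULE,
        .poll  = demo_poll,
};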
@@ -4327,22 +4460,29 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
}
-static void perf_sample_regs_user(struct perf_regs_user *regs_user,
- struct pt_regs *regs)
+static void perf_sample_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
{
- if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
+ if (user_mode(regs)) {
+ regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- regs_user->abi = perf_reg_abi(current);
+ } else if (current->mm) {
+ perf_get_regs_user(regs_user, regs, regs_user_copy);
+ } else {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
}
}
+static void perf_sample_regs_intr(struct perf_regs *regs_intr,
+ struct pt_regs *regs)
+{
+ regs_intr->regs = regs;
+ regs_intr->abi = perf_reg_abi(current);
+}
+
+
/*
* Get remaining task size from user stack pointer.
*
@@ -4724,6 +4864,23 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ u64 abi = data->regs_intr.abi;
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ perf_output_sample_regs(handle,
+ data->regs_intr.regs,
+ mask);
+ }
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -4789,12 +4946,14 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ perf_sample_regs_user(&data->regs_user, regs,
+ &data->regs_user_copy);
+
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_user(&data->regs_user, regs);
-
if (data->regs_user.regs) {
u64 mask = event->attr.sample_regs_user;
size += hweight64(mask) * sizeof(u64);
@@ -4810,15 +4969,11 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- struct perf_regs_user *uregs = &data->regs_user;
u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
- if (!uregs->abi)
- perf_sample_regs_user(uregs, regs);
-
stack_size = perf_sample_ustack_size(stack_size, header->size,
- uregs->regs);
+ data->regs_user.regs);
/*
* If there is something to dump, add space for the dump
@@ -4831,6 +4986,21 @@ void perf_prepare_sample(struct perf_event_header *header,
data->stack_user_size = stack_size;
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_intr(&data->regs_intr, regs);
+
+ if (data->regs_intr.regs) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -5702,7 +5872,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct perf_event *event;
struct hlist_head *head;
@@ -5721,7 +5891,7 @@ end:
int perf_swevent_get_recursion_context(void)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
return get_recursion_context(swhash->recursion);
}
@@ -5729,7 +5899,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
inline void perf_swevent_put_recursion_context(int rctx)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
put_recursion_context(swhash->recursion, rctx);
}
@@ -5758,7 +5928,7 @@ static void perf_swevent_read(struct perf_event *event)
static int perf_swevent_add(struct perf_event *event, int flags)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
@@ -5814,7 +5984,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
if (!hlist)
return;
- rcu_assign_pointer(swhash->swevent_hlist, NULL);
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
kfree_rcu(hlist, rcu_head);
}
@@ -5940,11 +6110,6 @@ static int perf_swevent_init(struct perf_event *event)
return 0;
}
-static int perf_swevent_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
@@ -5954,8 +6119,6 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
#ifdef CONFIG_EVENT_TRACING
@@ -6073,8 +6236,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
static inline void perf_tp_register(void)
@@ -6300,8 +6461,6 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
/*
@@ -6380,8 +6539,6 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6411,7 +6568,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}
/*
@@ -7031,6 +7188,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = -EINVAL;
}
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ ret = perf_reg_validate(attr->sample_regs_intr);
out:
return ret;
@@ -7315,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
synchronize_rcu();
- perf_install_in_context(ctx, group_leader, event->cpu);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, event->cpu);
+ perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
}
@@ -7397,6 +7556,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err;
}
+ /* Mark owner so we could distinguish it from user events. */
+ event->owner = EVENT_OWNER_KERNEL;
+
account_event(event);
ctx = find_get_context(event->pmu, task, cpu);
@@ -7484,6 +7646,12 @@ static void sync_child_event(struct perf_event *child_event,
mutex_unlock(&parent_event->child_mutex);
/*
+ * Make sure user/parent get notified that we just
+ * lost one event.
+ */
+ perf_event_wakeup(parent_event);
+
+ /*
* Release the parent event, if this was the last
* reference to it.
*/
@@ -7517,13 +7685,16 @@ __perf_event_exit_task(struct perf_event *child_event,
if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
+ } else {
+ child_event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(child_event);
}
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event, *next;
- struct perf_event_context *child_ctx, *parent_ctx;
+ struct perf_event_context *child_ctx, *clone_ctx = NULL;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7550,28 +7721,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
child->perf_event_ctxp[ctxn] = NULL;
/*
- * In order to avoid freeing: child_ctx->parent_ctx->task
- * under perf_event_context::lock, grab another reference.
- */
- parent_ctx = child_ctx->parent_ctx;
- if (parent_ctx)
- get_ctx(parent_ctx);
-
- /*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
* the events from it.
*/
- unclone_ctx(child_ctx);
+ clone_ctx = unclone_ctx(child_ctx);
update_context_time(child_ctx);
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
- /*
- * Now that we no longer hold perf_event_context::lock, drop
- * our extra child_ctx->parent_ctx reference.
- */
- if (parent_ctx)
- put_ctx(parent_ctx);
+ if (clone_ctx)
+ put_ctx(clone_ctx);
/*
* Report the task dead after unscheduling the events so that we
@@ -7700,6 +7859,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
+ enum perf_event_active_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -7720,7 +7880,8 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
- if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ if (is_orphaned_event(parent_event) ||
+ !atomic_long_inc_not_zero(&parent_event->refcount)) {
free_event(child_event);
return NULL;
}
@@ -7732,7 +7893,7 @@ inherit_event(struct perf_event *parent_event,
* not its attr.disabled bit. We hold the parent's mutex,
* so we won't race with perf_event_{en, dis}able_family.
*/
- if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (parent_state >= PERF_EVENT_STATE_INACTIVE)
child_event->state = PERF_EVENT_STATE_INACTIVE;
else
child_event->state = PERF_EVENT_STATE_OFF;
@@ -7997,7 +8158,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = false };
+ struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
perf_pmu_rotate_stop(ctx->pmu);
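The PERF_SAMPLE_REGS_INTR plumbing added above (validated in perf_copy_attr(), sized in perf_prepare_sample(), emitted in perf_output_sample()) is requested from userspace through two perf_event_attr fields. A rough userspace sketch follows, illustrative only: the 0xff register mask is a placeholder, a real caller builds it from the architecture's PERF_REG_* bits, and the installed headers must be new enough to define PERF_SAMPLE_REGS_INTR and sample_regs_intr.

/* Illustrative userspace sketch; the 0xff register mask is a placeholder. */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR;
        attr.sample_regs_intr = 0xff;   /* placeholder register bitmap */
        attr.disabled = 1;              /* left disabled; enabling and ring-buffer mmap omitted */

        fd = perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
        if (fd < 0)
                perror("perf_event_open");
        else
                printf("opened fd %ld\n", fd);
        return 0;
}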
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}
-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
int more = 0;
again:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
if (!prev && !more) {
/*
- * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
info->mm = vma->vm_mm;
info->vaddr = offset_to_vaddr(vma, offset);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (!more)
goto out;
@@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void)
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
set_tsk_thread_flag(t, TIF_UPROBE);
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7433a3..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk)
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
- /*
- * Accumulate here the counters for all threads but the
- * group leader as they die, so they can be added into
- * the process-wide totals when those are taken.
- * The group leader stays around as a zombie as long
- * as there are other threads. When it gets reaped,
- * the exit.c code will add its counts into these totals.
- * We won't ever get here for the group leader, since it
- * will have been the last reference on the signal_struct.
- */
- task_cputime(tsk, &utime, &stime);
- sig->utime += utime;
- sig->stime += stime;
- sig->gtime += task_gtime(tsk);
- sig->min_flt += tsk->min_flt;
- sig->maj_flt += tsk->maj_flt;
- sig->nvcsw += tsk->nvcsw;
- sig->nivcsw += tsk->nivcsw;
- sig->inblock += task_io_get_inblock(tsk);
- sig->oublock += task_io_get_oublock(tsk);
- task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
}
+ /*
+ * Accumulate here the counters for all threads as they die. We could
+ * skip the group leader because it is the last user of signal_struct,
+ * but we want to avoid the race with thread_group_cputime() which can
+ * see the empty ->thread_head list.
+ */
+ task_cputime(tsk, &utime, &stime);
+ write_seqlock(&sig->stats_lock);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->inblock += task_io_get_inblock(tsk);
+ sig->oublock += task_io_get_oublock(tsk);
+ task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
+ write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
@@ -214,27 +212,6 @@ repeat:
}
/*
- * This checks not only the pgrp, but falls back on the pid if no
- * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
- * without this...
- *
- * The caller must hold rcu lock or the tasklist lock.
- */
-struct pid *session_of_pgrp(struct pid *pgrp)
-{
- struct task_struct *p;
- struct pid *sid = NULL;
-
- p = pid_task(pgrp, PIDTYPE_PGID);
- if (p == NULL)
- p = pid_task(pgrp, PIDTYPE_PID);
- if (p != NULL)
- sid = task_session(p);
-
- return sid;
-}
-
-/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -461,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
clear_thread_flag(TIF_MEMDIE);
}
+static struct task_struct *find_alive_thread(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ if (!(t->flags & PF_EXITING))
+ return t;
+ }
+ return NULL;
+}
+
+static struct task_struct *find_child_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *reaper = pid_ns->child_reaper;
+
+ if (likely(reaper != father))
+ return reaper;
+
+ reaper = find_alive_thread(father);
+ if (reaper) {
+ pid_ns->child_reaper = reaper;
+ return reaper;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?: father->exit_code);
+ }
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+
+ return father;
+}
+
/*
* When we die, we re-parent all our children, and try to:
* 1. give them to another thread in our thread group, if such a member exists
@@ -468,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+static struct task_struct *find_new_reaper(struct task_struct *father,
+ struct task_struct *child_reaper)
{
- struct pid_namespace *pid_ns = task_active_pid_ns(father);
- struct task_struct *thread;
+ struct task_struct *thread, *reaper;
- thread = father;
- while_each_thread(father, thread) {
- if (thread->flags & PF_EXITING)
- continue;
- if (unlikely(pid_ns->child_reaper == father))
- pid_ns->child_reaper = thread;
+ thread = find_alive_thread(father);
+ if (thread)
return thread;
- }
-
- if (unlikely(pid_ns->child_reaper == father)) {
- write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?:
- father->exit_code);
- }
-
- zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
- } else if (father->signal->has_child_subreaper) {
- struct task_struct *reaper;
+ if (father->signal->has_child_subreaper) {
/*
- * Find the first ancestor marked as child_subreaper.
- * Note that the code below checks same_thread_group(reaper,
- * pid_ns->child_reaper). This is what we need to DTRT in a
- * PID namespace. However we still need the check above, see
- * http://marc.info/?l=linux-kernel&m=131385460420380
+ * Find the first ->is_child_subreaper ancestor in our pid_ns.
+ * We start from father to ensure we cannot look into another
+ * namespace; this is safe because all its threads are dead.
*/
- for (reaper = father->real_parent;
- reaper != &init_task;
+ for (reaper = father;
+ !same_thread_group(reaper, child_reaper);
reaper = reaper->real_parent) {
- if (same_thread_group(reaper, pid_ns->child_reaper))
+ /* call_usermodehelper() descendants need this check */
+ if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
- thread = reaper;
- do {
- if (!(thread->flags & PF_EXITING))
- return reaper;
- } while_each_thread(reaper, thread);
+ thread = find_alive_thread(reaper);
+ if (thread)
+ return thread;
}
}
- return pid_ns->child_reaper;
+ return child_reaper;
}
/*
@@ -528,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- if (p->exit_state == EXIT_DEAD)
- return;
- /*
- * If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
+ if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
@@ -547,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
- list_move_tail(&p->sibling, dead);
+ list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
-static void forget_original_parent(struct task_struct *father)
+/*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void forget_original_parent(struct task_struct *father,
+ struct list_head *dead)
{
- struct task_struct *p, *n, *reaper;
- LIST_HEAD(dead_children);
+ struct task_struct *p, *t, *reaper;
- write_lock_irq(&tasklist_lock);
- /*
- * Note that exit_ptrace() and find_new_reaper() might
- * drop tasklist_lock and reacquire it.
- */
- exit_ptrace(father);
- reaper = find_new_reaper(father);
+ if (unlikely(!list_empty(&father->ptraced)))
+ exit_ptrace(father, dead);
- list_for_each_entry_safe(p, n, &father->children, sibling) {
- struct task_struct *t = p;
+ /* Can drop and reacquire tasklist_lock */
+ reaper = find_child_reaper(father);
+ if (list_empty(&father->children))
+ return;
- do {
+ reaper = find_new_reaper(father, reaper);
+ list_for_each_entry(p, &father->children, sibling) {
+ for_each_thread(p, t) {
t->real_parent = reaper;
- if (t->parent == father) {
- BUG_ON(t->ptrace);
+ BUG_ON((!t->ptrace) != (t->parent == father));
+ if (likely(!t->ptrace))
t->parent = t->real_parent;
- }
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t);
- } while_each_thread(p, t);
- reparent_leader(father, p, &dead_children);
- }
- write_unlock_irq(&tasklist_lock);
-
- BUG_ON(!list_empty(&father->children));
-
- list_for_each_entry_safe(p, n, &dead_children, sibling) {
- list_del_init(&p->sibling);
- release_task(p);
+ }
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (!same_thread_group(reaper, father))
+ reparent_leader(father, p, dead);
}
+ list_splice_tail_init(&father->children, &reaper->children);
}
/*
@@ -599,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
-
- /*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- */
- forget_original_parent(tsk);
+ struct task_struct *p, *n;
+ LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
+
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -628,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+ if (tsk->exit_state == EXIT_DEAD)
+ list_add(&tsk->ptrace_entry, &dead);
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- /* If the process is dead, release it - nobody will wait for it */
- if (autoreap)
- release_task(tsk);
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
}
#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -667,6 +653,7 @@ void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
+ TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
@@ -775,6 +762,7 @@ void do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
+ TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
@@ -814,6 +802,7 @@ void do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+ TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
/*
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
@@ -978,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- unsigned long state;
- int retval, status, traced;
+ int state, retval, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
@@ -993,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
if ((exit_code & 0x7f) == 0) {
why = CLD_EXITED;
status = exit_code >> 8;
@@ -1002,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
}
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
-
- traced = ptrace_reparented(p);
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ state = (ptrace_reparented(p) && thread_group_leader(p)) ?
+ EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
- * It can be ptraced but not reparented, check
- * thread_group_leader() to filter out sub-threads.
+ * We own this thread, nobody else can reap it.
*/
- if (likely(!traced) && thread_group_leader(p)) {
- struct signal_struct *psig;
- struct signal_struct *sig;
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ /*
+ * Check thread_group_leader() to exclude the traced sub-threads.
+ */
+ if (state == EXIT_DEAD && thread_group_leader(p)) {
+ struct signal_struct *sig = p->signal;
+ struct signal_struct *psig = current->signal;
unsigned long maxrss;
cputime_t tgutime, tgstime;
@@ -1028,21 +1022,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
- * p->signal fields, because they are only touched by
- * __exit_signal, which runs with tasklist_lock
- * write-locked anyway, and so is excluded here. We do
- * need to protect the access to parent->signal fields,
- * as other threads in the parent group can be right
- * here reaping other children at the same time.
+ * p->signal fields because the whole thread group is dead
+ * and nobody can change them.
+ *
+ * psig->stats_lock also protects us from our sub-theads
+ * which can reap other children at the same time. Until
+ * we change k_getrusage()-like users to rely on this lock
+ * we have to take ->siglock as well.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&p->real_parent->sighand->siglock);
- psig = p->real_parent->signal;
- sig = p->signal;
+ spin_lock_irq(&current->sighand->siglock);
+ write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1065,15 +1059,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- spin_unlock_irq(&p->real_parent->sighand->siglock);
+ write_sequnlock(&psig->stats_lock);
+ spin_unlock_irq(&current->sighand->siglock);
}
- /*
- * Now we are sure this task is interesting, and no other
- * thread can reap it because we its state == DEAD/TRACE.
- */
- read_unlock(&tasklist_lock);
-
retval = wo->wo_rusage
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1204,6 +1193,7 @@ unlock_sig:
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1266,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (!wo->wo_info) {
retval = wo->wo_rusage
@@ -1296,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
+ /*
+ * We can race with wait_task_zombie() from another thread.
+ * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+ * can't confuse the checks below.
+ */
+ int exit_state = ACCESS_ONCE(p->exit_state);
int ret;
- if (unlikely(p->exit_state == EXIT_DEAD))
+ if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, p);
@@ -1319,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (unlikely(p->exit_state == EXIT_TRACE)) {
+ if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
@@ -1346,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
}
/* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
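The wait_task_zombie() hunk above moves the parent's c* accounting updates under psig->stats_lock, the seqlock initialized in copy_signal() later in this series. A minimal reader-side sketch, assuming a k_getrusage()-style reader; the task pointer and the fields read are illustrative only:

	/* Hedged sketch: lockless, retrying read of the c* accounting fields. */
	struct signal_struct *sig = tsk->signal;	/* tsk: some task of interest */
	cputime_t cutime, cstime;
	unsigned int seq;

	do {
		seq = read_seqbegin(&sig->stats_lock);
		cutime = sig->cutime;
		cstime = sig->cstime;
	} while (read_seqretry(&sig->stats_lock, seq));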
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
+#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
+ if (is_ftrace_trampoline(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return is_module_text_address(addr);
+ if (is_module_text_address(addr))
+ return 1;
+ return is_ftrace_trampoline(addr);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index a91e47d86de2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
return 0;
}
+void set_task_stack_end_magic(struct task_struct *tsk)
+{
+ unsigned long *stackend;
+
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
+}
+
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
- unsigned long *stackend;
int node = tsk_fork_get_node(orig);
int err;
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
- stackend = end_of_stack(tsk);
- *stackend = STACK_END_MAGIC; /* for overflow detection */
+ set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
@@ -427,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
@@ -439,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
vma_interval_tree_insert_after(tmp, mpnt,
&mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
@@ -601,9 +607,8 @@ static void check_mm(struct mm_struct *mm)
printk(KERN_ALERT "BUG: Bad rss-counter state "
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON(mm->pmd_huge_pte);
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -1017,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
{
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
+ /*
+ * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * without an RCU grace period, see __lock_task_sighand().
+ */
kmem_cache_free(sighand_cachep, sighand);
}
}
-
/*
* Initialize POSIX timer handling for a thread group.
*/
@@ -1068,6 +1076,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
+ seqlock_init(&sig->stats_lock);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
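set_task_stack_end_magic() factors out the write of STACK_END_MAGIC at end_of_stack() so other callers can reuse it. A hedged sketch of the kind of overflow check this sentinel enables; the caller and message are illustrative, not part of this patch:

	/* Illustrative only: detect that the sentinel word has been overwritten. */
	if (*end_of_stack(tsk) != STACK_END_MAGIC)
		pr_emerg("kernel stack overflow suspected for %s/%d\n",
			 tsk->comm, task_pid_nr(tsk));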
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
+ if (test_thread_flag(TIF_MEMDIE))
+ return false;
+
if (pm_nosig_freezing || cgroup_freezing(p))
return true;
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
- * be visible to @p as waking up implies wmb. Waking up inside
- * freezer_lock also prevents wakeups from leaking outside
- * refrigerator.
- */
spin_lock_irqsave(&freezer_lock, flags);
if (frozen(p))
wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 815d7af2ffe8..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
*
* Where (A) orders the waiters increment and the futex value read through
* atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read -- this is done by the barriers in
- * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
- * futex type.
+ * to futex and the waiters read -- this is done by the barriers for both
+ * shared and private futexes in get_futex_key_refs().
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key)
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies MB (B) */
break;
+ default:
+ /*
+		 * Private futexes do not hold a reference on an inode or
+		 * mm; the only purpose of calling get_futex_key_refs()
+		 * is that we need the barrier for the lockless waiter check.
+ */
+ smp_mb(); /* explicit MB (B) */
}
}
/*
* Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
+ * The hash bucket spinlock must not be held. This is
+ * a no-op for private futexes, see comment in the get
+ * counterpart.
*/
static void drop_futex_key_refs(union futex_key *key)
{
@@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+/*
+ * Must be called with the hb lock held.
+ */
static void free_pi_state(struct futex_pi_state *pi_state)
{
+ if (!pi_state)
+ return;
+
if (!atomic_dec_and_test(&pi_state->refcount))
return;
@@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}
retry:
- if (pi_state != NULL) {
- /*
- * We will have to lookup the pi_state again, so free this one
- * to keep the accounting correct.
- */
- free_pi_state(pi_state);
- pi_state = NULL;
- }
-
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -1617,6 +1622,8 @@ retry_private:
case 0:
break;
case -EFAULT:
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1632,6 +1639,8 @@ retry_private:
* exit to complete.
* - The user space value changed.
*/
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1708,6 +1717,7 @@ retry_private:
}
out_unlock:
+ free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -1725,8 +1735,6 @@ out_put_keys:
out_put_key1:
put_futex_key(&key1);
out:
- if (pi_state != NULL)
- free_pi_state(pi_state);
return ret ? ret : task_count;
}
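The smp_mb() added for private futexes pairs with barrier (A) implied by hb_waiters_inc(). A comment-style sketch of the waiter/waker interleaving it protects (not actual futex code; hb_waiters_pending() stands in for the waker-side check):

	/*
	 *   waiter (futex_wait)             waker (futex_wake)
	 *   ------------------------        ------------------------
	 *   hb_waiters_inc(hb);   (A)       *uaddr = newval;
	 *   uval = *uaddr;                   smp_mb();           (B)
	 *   if (uval != val)                 if (!hb_waiters_pending(hb))
	 *           return;                          return;  -- skip wakeup
	 *
	 * Either the waiter observes the new value and does not sleep, or the
	 * waker observes the incremented waiter count and issues the wakeup,
	 * so a wakeup can never be lost.
	 */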
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
Note that the debugfs filesystem has to be mounted to access
profiling data.
+config ARCH_HAS_GCOV_PROFILE_ALL
+ def_bool n
+
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
+ depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
return i;
}
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
+}
+
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for hierarchical irq domains
+config IRQ_DOMAIN_HIERARCHY
+ bool
+ select IRQ_DOMAIN
+
+# Generic MSI interrupt support
+config GENERIC_MSI_IRQ
+ bool
+
+# Generic MSI hierarchical interrupt domain support
+config GENERIC_MSI_IRQ_DOMAIN
+ bool
+ select IRQ_DOMAIN_HIERARCHY
+ select GENERIC_MSI_IRQ
+
+config HANDLE_DOMAIN_IRQ
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_PM_SLEEP) += pm.o
+obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6223fab9a9d2..6f1c7a566b95 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/irqdomain.h>
#include <trace/events/irq.h>
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_state_clr_disabled(desc);
desc->depth = 0;
+ irq_domain_activate_irq(&desc->irq_data);
if (desc->irq_data.chip->irq_startup) {
ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
desc->irq_data.chip->irq_disable(&desc->irq_data);
else
desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_state_set_masked(desc);
}
@@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc)
return irq_wait_for_poll(desc);
}
+static bool irq_may_run(struct irq_desc *desc)
+{
+ unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+
+ /*
+ * If the interrupt is not in progress and is not an armed
+ * wakeup interrupt, proceed.
+ */
+ if (!irqd_has_set(&desc->irq_data, mask))
+ return true;
+
+ /*
+ * If the interrupt is an armed wakeup source, mark it pending
+ * and suspended, disable it and notify the pm core about the
+ * event.
+ */
+ if (irq_pm_check_wakeup(desc))
+ return false;
+
+ /*
+ * Handle a potential concurrent poll on a different core.
+ */
+ return irq_check_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out;
+ if (!irq_may_run(desc))
+ goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -541,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+	 * If it's disabled or no action is available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
@@ -602,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+	 * If it's disabled or no action is available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
do {
@@ -670,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
- void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
+ void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
irqreturn_t res;
kstat_incr_irqs_this_cpu(irq, desc);
@@ -699,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
if (!handle) {
handle = handle_bad_irq;
} else {
- if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+ struct irq_data *irq_data = &desc->irq_data;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /*
+ * With hierarchical domains we might run into a
+ * situation where the outermost chip is not yet set
+ * up, but the inner chips are there. Instead of
+ * bailing we install the handler, but obviously we
+ * cannot enable/startup the interrupt at this point.
+ */
+ while (irq_data) {
+ if (irq_data->chip != &no_irq_chip)
+ break;
+ /*
+ * Bail out if the outer chip is not set up
+			 * and the interrupt is supposed to be started
+ * right away.
+ */
+ if (WARN_ON(is_chained))
+ goto out;
+ /* Try the parent */
+ irq_data = irq_data->parent_data;
+ }
+#endif
+ if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
goto out;
}
@@ -818,3 +873,105 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_chip_ack_parent - Acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_ack(data);
+}
+
+/**
+ * irq_chip_mask_parent - Mask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask(data);
+}
+
+/**
+ * irq_chip_unmask_parent - Unmask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_unmask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_eoi_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_eoi(data);
+}
+
+/**
+ * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_affinity_parent(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_affinity)
+ return data->chip->irq_set_affinity(data, dest, force);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
+ * @data: Pointer to interrupt specific data
+ *
+ * Iterate through the domain hierarchy of the interrupt and check
+ * whether a hw retrigger function exists. If yes, invoke it.
+ */
+int irq_chip_retrigger_hierarchy(struct irq_data *data)
+{
+ for (data = data->parent_data; data; data = data->parent_data)
+ if (data->chip && data->chip->irq_retrigger)
+ return data->chip->irq_retrigger(data);
+
+ return -ENOSYS;
+}
+#endif
+
+/**
+ * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip
+ * @data: Pointer to interrupt specific data
+ * @msg: Pointer to the MSI message
+ *
+ * For hierarchical domains we find the first chip in the hierarchy
+ * which implements the irq_compose_msi_msg callback. For non-
+ * hierarchical domains we use the top level chip.
+ */
+int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct irq_data *pos = NULL;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ for (; data; data = data->parent_data)
+#endif
+ if (data->chip && data->chip->irq_compose_msi_msg)
+ pos = data;
+ if (!pos)
+ return -ENOSYS;
+
+ pos->chip->irq_compose_msi_msg(pos, msg);
+
+ return 0;
+}
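The new irq_chip_*_parent() helpers let the outermost chip of a stacked (hierarchical) domain simply delegate its operations to the parent chip. A hedged sketch of how such a chip could be declared; the chip itself is hypothetical:

	/* Hypothetical outer chip that forwards everything to its parent. */
	static struct irq_chip example_outer_chip = {
		.name			= "example-outer",
		.irq_ack		= irq_chip_ack_parent,
		.irq_mask		= irq_chip_mask_parent,
		.irq_unmask		= irq_chip_unmask_parent,
		.irq_eoi		= irq_chip_eoi_parent,
		.irq_set_affinity	= irq_chip_set_affinity_parent,
		.irq_retrigger		= irq_chip_retrigger_hierarchy,
	};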
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef0606797c9..d5d0f7345c54 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
- * request_irq(). IRQs requested with this function will be
+ * request_threaded_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
+ irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache |= mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache &= ~mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
+ irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
u32 mask = ~d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
+ irq_reg_writel(gc, mask, ct->regs.eoi);
irq_gc_unlock(gc);
}
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
return 0;
}
+static u32 irq_readl_be(void __iomem *addr)
+{
+ return ioread32be(addr);
+}
+
+static void irq_writel_be(u32 val, void __iomem *addr)
+{
+ iowrite32be(val, addr);
+}
+
static void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
ct[i].mask_cache = mskptr;
if (flags & IRQ_GC_INIT_MASK_CACHE)
- *mskptr = irq_reg_readl(gc->reg_base + mskreg);
+ *mskptr = irq_reg_readl(gc, mskreg);
}
}
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
dgc->gc[i] = gc = tmp;
irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
NULL, handler);
+
gc->domain = d;
+ if (gcflags & IRQ_GC_BE_IO) {
+ gc->reg_readl = &irq_readl_be;
+ gc->reg_writel = &irq_writel_be;
+ }
+
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
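With the reg_readl/reg_writel indirection, a driver for a big-endian controller can request the ioread32be/iowrite32be accessors by passing IRQ_GC_BE_IO. A hedged usage sketch; the domain pointer, chip counts and name are placeholders:

	/* Sketch: allocate generic chips that use big-endian register accessors. */
	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE |
					     IRQ_GC_BE_IO);
	if (ret)
		return ret;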
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+
+#ifdef CONFIG_PM_SLEEP
+bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
+#else
+static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void
+irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
+static inline void
+irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
+#include <linux/irqdomain.h>
#include "internals.h"
@@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
static inline void free_masks(struct irq_desc *desc) { }
#endif
+void irq_lock_sparse(void)
+{
+ mutex_lock(&sparse_irq_lock);
+}
+
+void irq_unlock_sparse(void)
+{
+ mutex_unlock(&sparse_irq_lock);
+}
+
static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
{
struct irq_desc *desc;
@@ -167,6 +178,12 @@ static void free_desc(unsigned int irq)
unregister_irq_proc(irq, desc);
+ /*
+ * sparse_irq_lock protects also show_interrupts() and
+	 * kstat_irqs_usr(). Once we have deleted the descriptor from the
+ * sparse tree we can free it. Access in proc will fail to
+ * lookup the descriptor.
+ */
mutex_lock(&sparse_irq_lock);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
@@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @lookup: Whether to perform the domain lookup or not
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
+ bool lookup, struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq = hwirq;
+ int ret = 0;
+
+ irq_enter();
+
+#ifdef CONFIG_IRQ_DOMAIN
+ if (lookup)
+ irq = irq_find_mapping(domain, hwirq);
+#endif
+
+ /*
+ * Some hardware gives randomly wrong interrupts. Rather
+ * than crashing, do something sensible.
+ */
+ if (unlikely(!irq || irq >= nr_irqs)) {
+ ack_bad_irq(irq);
+ ret = -EINVAL;
+ } else {
+ generic_handle_irq(irq);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
+
/* Dynamic interrupt handling */
/**
@@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
+/**
+ * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
+ * @irq: The interrupt number
+ * @cpu: The cpu number
+ *
+ * Returns the sum of interrupt counts on @cpu since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+/**
+ * kstat_irqs - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
+
+/**
+ * kstat_irqs_usr - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. Contrary to kstat_irqs() this can be called from any
+ * preemptible context. It's protected against concurrent removal of
+ * an interrupt descriptor when sparse irqs are enabled.
+ */
+unsigned int kstat_irqs_usr(unsigned int irq)
+{
+ int sum;
+
+ irq_lock_sparse();
+ sum = kstat_irqs(irq);
+ irq_unlock_sparse();
+ return sum;
+}
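__handle_domain_irq() is meant to be called from the low-level architecture entry code, typically through a handle_domain_irq() wrapper that passes lookup=true. A hedged sketch of such a root handler; the register read, base address and domain pointer are made-up placeholders:

	/* Hypothetical arch-level root handler for a single interrupt controller. */
	static void example_handle_irq(struct pt_regs *regs)
	{
		u32 hwirq = readl_relaxed(example_intc_base + EXAMPLE_PENDING);

		handle_domain_irq(example_domain, hwirq, regs);
	}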
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6534ff6ce02e..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+ irq_hw_number_t hwirq, int node);
+static void irq_domain_check_hierarchy(struct irq_domain *domain);
+
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
* direct mapping
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
* Allocates and initialize and irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
domain->hwirq_max = hwirq_max;
domain->revmap_size = size;
domain->revmap_direct_max_irq = direct_max;
+ irq_domain_check_hierarchy(domain);
mutex_lock(&irq_domain_mutex);
list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
* pre-map all of the irqs in the domain to virqs starting at first_irq.
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
* Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
domain = __irq_domain_add(of_node, first_hwirq + size,
first_hwirq + size, 0, ops, host_data);
- if (!domain)
- return NULL;
-
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+ if (domain)
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- unsigned int hint;
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- hint = hwirq % nr_irqs;
- if (hint == 0)
- hint++;
- virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
- if (virq <= 0)
- virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq,
+ of_node_to_nid(domain->of_node));
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
- unsigned int virq;
+ int virq;
domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
return 0;
}
- /* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
- if (!virq)
- return virq;
+ if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq)
+ return virq;
+
+ virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
+ if (virq <= 0)
+ return 0;
+ } else {
+ /* Create mapping */
+ virq = irq_create_mapping(domain, hwirq);
+ if (!virq)
+ return virq;
+ }
/* Set type if specified and different than the current one */
if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
return 0;
if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_get_irq_data(hwirq);
- if (data && (data->domain == domain) && (data->hwirq == hwirq))
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
return hwirq;
}
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+
+static int irq_domain_alloc_descs(int virq, unsigned int cnt,
+ irq_hw_number_t hwirq, int node)
+{
+ unsigned int hint;
+
+ if (virq >= 0) {
+ virq = irq_alloc_descs(virq, virq, cnt, node);
+ } else {
+ hint = hwirq % nr_irqs;
+ if (hint == 0)
+ hint++;
+ virq = irq_alloc_descs_from(hint, cnt, node);
+ if (virq <= 0 && hint > 1)
+ virq = irq_alloc_descs_from(1, cnt, node);
+ }
+
+ return virq;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy
+ * @parent: Parent irq domain to associate with the new domain
+ * @flags: Irq domain flags associated to the domain
+ * @size: Size of the domain. See below
+ * @node: Optional device-tree node of the interrupt controller
+ * @ops: Pointer to the interrupt domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * If @size is 0 a tree domain is created, otherwise a linear domain.
+ *
+ * If successful the parent is associated to the new domain and the
+ * domain flags are set.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
+ unsigned int flags,
+ unsigned int size,
+ struct device_node *node,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ if (size)
+ domain = irq_domain_add_linear(node, size, ops, host_data);
+ else
+ domain = irq_domain_add_tree(node, ops, host_data);
+ if (domain) {
+ domain->parent = parent;
+ domain->flags |= flags;
+ }
+
+ return domain;
+}
+
+static void irq_domain_insert_irq(int virq)
+{
+ struct irq_data *data;
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && data->chip)
+ domain->name = data->chip->name;
+ }
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+}
+
+static void irq_domain_remove_irq(int virq)
+{
+ struct irq_data *data;
+
+ irq_set_status_flags(virq, IRQ_NOREQUEST);
+ irq_set_chip_and_handler(virq, NULL, NULL);
+ synchronize_irq(virq);
+ smp_mb();
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ }
+}
+
+static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
+ struct irq_data *child)
+{
+ struct irq_data *irq_data;
+
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ if (irq_data) {
+ child->parent_data = irq_data;
+ irq_data->irq = child->irq;
+ irq_data->node = child->node;
+ irq_data->domain = domain;
+ }
+
+ return irq_data;
+}
+
+static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data, *tmp;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ tmp = irq_data->parent_data;
+ irq_data->parent_data = NULL;
+ irq_data->domain = NULL;
+
+ while (tmp) {
+ irq_data = tmp;
+ tmp = tmp->parent_data;
+ kfree(irq_data);
+ }
+ }
+}
+
+static int irq_domain_alloc_irq_data(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct irq_domain *parent;
+ int i;
+
+ /* The outermost irq_data is embedded in struct irq_desc */
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ irq_data->domain = domain;
+
+ for (parent = domain->parent; parent; parent = parent->parent) {
+ irq_data = irq_domain_insert_irq_data(parent, irq_data);
+ if (!irq_data) {
+ irq_domain_free_irq_data(virq, i + 1);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data;
+
+ for (irq_data = irq_get_irq_data(virq); irq_data;
+ irq_data = irq_data->parent_data)
+ if (irq_data->domain == domain)
+ return irq_data;
+
+ return NULL;
+}
+
+/**
+ * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hwirq number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated chip data
+ */
+int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ if (!irq_data)
+ return -ENOENT;
+
+ irq_data->hwirq = hwirq;
+ irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip_data = chip_data;
+
+ return 0;
+}
+
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
+ __irq_set_handler(virq, handler, 0, handler_name);
+ irq_set_handler_data(virq, handler_data);
+}
+
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+
+/**
+ * irq_domain_free_irqs_common - Clear irq_data and free the parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data)
+ irq_domain_reset_irq_data(irq_data);
+ }
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+/**
+ * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_handler_data(virq + i, NULL);
+ irq_set_handler(virq + i, NULL);
+ }
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
+}
+
+static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs)
+{
+ domain->ops->free(domain, irq_base, nr_irqs);
+ if (irq_domain_is_auto_recursive(domain)) {
+ BUG_ON(!domain->parent);
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+ }
+}
+
+static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret = 0;
+ struct irq_domain *parent = domain->parent;
+ bool recursive = irq_domain_is_auto_recursive(domain);
+
+ BUG_ON(recursive && !parent);
+ if (recursive)
+ ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
+ nr_irqs, arg);
+ if (ret >= 0)
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0 && recursive)
+ irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
+
+ return ret;
+}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate the specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ *
+ * Allocate IRQ numbers and initialize all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc)
+{
+ int i, ret, virq;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
+ if (realloc && irq_base >= 0) {
+ virq = irq_base;
+ } else {
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ if (virq < 0) {
+ pr_debug("cannot allocate IRQ(base %d, count %d)\n",
+ irq_base, nr_irqs);
+ return virq;
+ }
+ }
+
+ if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
+ pr_debug("cannot allocate memory for IRQ%d\n", virq);
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ mutex_lock(&irq_domain_mutex);
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
+ if (ret < 0) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_insert_irq(virq + i);
+ mutex_unlock(&irq_domain_mutex);
+
+ return virq;
+
+out_free_irq_data:
+ irq_domain_free_irq_data(virq, nr_irqs);
+out_free_desc:
+ irq_free_descs(virq, nr_irqs);
+ return ret;
+}
+
+/**
+ * irq_domain_free_irqs - Free IRQ number and associated data structures
+ * @virq: base IRQ number
+ * @nr_irqs: number of IRQs to free
+ */
+void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ int i;
+
+ if (WARN(!data || !data->domain || !data->domain->ops->free,
+ "NULL pointer, cannot free irq\n"))
+ return;
+
+ mutex_lock(&irq_domain_mutex);
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_remove_irq(virq + i);
+ irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_domain_free_irq_data(virq, nr_irqs);
+ irq_free_descs(virq, nr_irqs);
+}
+
+/**
+ * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to allocate
+ * @arg: Allocation data (arch/domain specific)
+ *
+ * Check whether the domain has been set up recursively. If not, allocate
+ * through the parent domain.
+ */
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs,
+ void *arg)
+{
+ /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
+ if (irq_domain_is_auto_recursive(domain))
+ return 0;
+
+ domain = domain->parent;
+ if (domain)
+ return irq_domain_alloc_irqs_recursive(domain, irq_base,
+ nr_irqs, arg);
+ return -ENOSYS;
+}
+
+/**
+ * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to free
+ *
+ * Check whether the domain has been set up recursively. If not, free
+ * through the parent domain.
+ */
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs)
+{
+ /* irq_domain_free_irqs_recursive() will call parent's free */
+ if (!irq_domain_is_auto_recursive(domain) && domain->parent)
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+}
+
+/**
+ * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
+ * interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * This is the second step to call domain_ops->activate to program interrupt
+ * controllers, so the interrupt could actually get delivered.
+ */
+void irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+/**
+ * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
+ * deactivate interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * It calls domain_ops->deactivate to program interrupt controllers to disable
+ * interrupt delivery.
+ */
+void irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+ /* Hierarchy irq_domains must implement callback alloc() */
+ if (domain->ops->alloc)
+ domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
+}
+#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+
+ return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+}
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
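A stacked domain created with irq_domain_add_hierarchy() only needs to supply .alloc/.free callbacks; descriptor allocation, revmap insertion and activation are handled by the core code added above. A hedged sketch of what such callbacks might look like, reusing the helpers from this file; example_get_hwirq() and example_chip are hypothetical:

	static int example_domain_alloc(struct irq_domain *domain,
					unsigned int virq,
					unsigned int nr_irqs, void *arg)
	{
		irq_hw_number_t hwirq = example_get_hwirq(arg);	/* made-up helper */
		int i, ret;

		ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
		if (ret < 0)
			return ret;

		for (i = 0; i < nr_irqs; i++)
			irq_domain_set_hwirq_and_chip(domain, virq + i,
						      hwirq + i, &example_chip,
						      NULL);
		return 0;
	}

	static const struct irq_domain_ops example_domain_ops = {
		.alloc	= example_domain_alloc,
		.free	= irq_domain_free_irqs_common,
	};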
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..80692373abd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
cpumask_copy(data->affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
@@ -382,14 +383,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+void __disable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
- return;
- desc->istate |= IRQS_SUSPENDED;
- }
-
if (!desc->depth++)
irq_disable(desc);
}
@@ -401,7 +396,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq, false);
+ __disable_irq(desc, irq);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -442,20 +437,8 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
+void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (resume) {
- if (!(desc->istate & IRQS_SUSPENDED)) {
- if (!desc->action)
- return;
- if (!(desc->action->flags & IRQF_FORCE_RESUME))
- return;
- /* Pretend that it got disabled ! */
- desc->depth++;
- }
- desc->istate &= ~IRQS_SUSPENDED;
- }
-
switch (desc->depth) {
case 0:
err_out:
@@ -497,7 +480,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -618,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
@@ -1218,6 +1202,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
*old_ptr = new;
+ irq_pm_install_action(desc, new);
+
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -1228,7 +1214,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1322,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
+ irq_pm_remove_action(desc, action);
+
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_shutdown(desc);
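IRQ_SET_MASK_OK_DONE behaves like IRQ_SET_MASK_OK for the core (the mask is copied) but tells stacked callers such as msi_domain_set_affinity() that nothing further, e.g. rewriting an MSI message, is required. A hedged sketch of a chip callback returning it; the hardware helper is made up:

	static int example_set_affinity(struct irq_data *d,
					const struct cpumask *mask, bool force)
	{
		/* Hypothetical hardware routing; affinity fully handled here. */
		example_hw_route_irq(d->hwirq, cpumask_first(mask));

		return IRQ_SET_MASK_OK_DONE;
	}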
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..3e18163f336f
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,330 @@
+/*
+ * linux/kernel/irq/msi.c
+ *
+ * Copyright (C) 2014 Intel Corp.
+ * Author: Jiang Liu <jiang.liu@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file contains common code to support Message Signalled Interrupts for
+ * PCI compatible and non-PCI compatible devices.
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+/* Temporary solution for building, will be removed later */
+#include <linux/pci.h>
+
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+ *msg = entry->msg;
+}
+
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_get_msi_desc(irq);
+
+ __get_cached_msi_msg(entry, msg);
+}
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static inline void irq_chip_write_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ data->chip->irq_write_msi_msg(data, msg);
+}
+
+/**
+ * msi_domain_set_affinity - Generic affinity setter function for MSI domains
+ * @irq_data: The irq data associated to the interrupt
+ * @mask: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Intended to be used by MSI interrupt controllers which are
+ * implemented with hierarchical domains.
+ */
+int msi_domain_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct msi_msg msg;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+ }
+
+ return ret;
+}
+
+static void msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static void msi_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ memset(&msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
+ int i, ret;
+
+ if (irq_find_mapping(domain, hwirq) > 0)
+ return -EEXIST;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
+ if (ret < 0) {
+ if (ops->msi_free) {
+ for (i--; i > 0; i--)
+ ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct msi_domain_info *info = domain->host_data;
+ int i;
+
+ if (info->ops->msi_free) {
+ for (i = 0; i < nr_irqs; i++)
+ info->ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+ .alloc = msi_domain_alloc,
+ .free = msi_domain_free,
+ .activate = msi_domain_activate,
+ .deactivate = msi_domain_deactivate,
+};
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hwirq;
+}
+
+static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
+ struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+#else
+#define msi_domain_ops_get_hwirq NULL
+#define msi_domain_ops_prepare NULL
+#define msi_domain_ops_set_desc NULL
+#endif /* !GENERIC_MSI_DOMAIN_OPS */
+
+static int msi_domain_ops_init(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
+ info->chip_data);
+ if (info->handler && info->handler_name) {
+ __irq_set_handler(virq, info->handler, 0, info->handler_name);
+ if (info->handler_data)
+ irq_set_handler_data(virq, info->handler_data);
+ }
+ return 0;
+}
+
+static int msi_domain_ops_check(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ return 0;
+}
+
+static struct msi_domain_ops msi_domain_ops_default = {
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+};
+
+static void msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+ struct msi_domain_ops *ops = info->ops;
+
+ if (ops == NULL) {
+ info->ops = &msi_domain_ops_default;
+ return;
+ }
+
+ if (ops->get_hwirq == NULL)
+ ops->get_hwirq = msi_domain_ops_default.get_hwirq;
+ if (ops->msi_init == NULL)
+ ops->msi_init = msi_domain_ops_default.msi_init;
+ if (ops->msi_check == NULL)
+ ops->msi_check = msi_domain_ops_default.msi_check;
+ if (ops->msi_prepare == NULL)
+ ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->set_desc == NULL)
+ ops->set_desc = msi_domain_ops_default.set_desc;
+}
+
+static void msi_domain_update_chip_ops(struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ BUG_ON(!chip);
+ if (!chip->irq_mask)
+ chip->irq_mask = pci_msi_mask_irq;
+ if (!chip->irq_unmask)
+ chip->irq_unmask = pci_msi_unmask_irq;
+ if (!chip->irq_set_affinity)
+ chip->irq_set_affinity = msi_domain_set_affinity;
+}
+
+/**
+ * msi_create_irq_domain - Create an MSI interrupt domain
+ * @node: Optional device-tree node of the interrupt controller
+ * @info: MSI domain info
+ * @parent: Parent irq domain
+ */
+struct irq_domain *msi_create_irq_domain(struct device_node *node,
+ struct msi_domain_info *info,
+ struct irq_domain *parent)
+{
+ if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
+ info);
+}
+
+/**
+ * msi_domain_alloc_irqs - Allocate interrupts from an MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
+ */
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ msi_alloc_info_t arg;
+ struct msi_desc *desc;
+ int i, ret, virq = -1;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ if (ret)
+ return ret;
+
+ for_each_msi_entry(desc, dev) {
+ ops->set_desc(&arg, desc);
+ if (info->flags & MSI_FLAG_IDENTITY_MAP)
+ virq = (int)ops->get_hwirq(info, &arg);
+ else
+ virq = -1;
+
+ virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ dev_to_node(dev), &arg, false);
+ if (virq < 0) {
+ ret = -ENOSPC;
+ if (ops->handle_error)
+ ret = ops->handle_error(domain, desc, ret);
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, ret);
+ return ret;
+ }
+
+ for (i = 0; i < desc->nvec_used; i++)
+ irq_set_msi_desc_off(virq, i, desc);
+ }
+
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, 0);
+
+ for_each_msi_entry(desc, dev) {
+ if (desc->nvec_used == 1)
+ dev_dbg(dev, "irq %d for MSI\n", virq);
+ else
+ dev_dbg(dev, "irq [%d-%d] for MSI\n",
+ virq, virq + desc->nvec_used - 1);
+ }
+
+ return 0;
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from an MSI interrupt @domain associated to @dev
+ * @domain: The domain managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_desc *desc;
+
+ for_each_msi_entry(desc, dev) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
+}
+
+/**
+ * msi_get_domain_info - Get the MSI interrupt domain info for @domain
+ * @domain: The interrupt domain to retrieve data from
+ *
+ * Returns the pointer to the msi_domain_info stored in
+ * @domain->host_data.
+ */
+struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
+{
+ return (struct msi_domain_info *)domain->host_data;
+}
+
+#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
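As an illustration of the new API added above (a sketch only, not part of this patch): a minimal irqchip driver sitting on top of msi_create_irq_domain(). All foo_* names are invented, CONFIG_GENERIC_MSI_IRQ_DOMAIN is assumed to be enabled, and the parent domain's chip is assumed to provide irq_compose_msi_msg().

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/of.h>

/* Program the composed address/data pair into the (imaginary) foo doorbell. */
static void foo_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	/* hardware access would go here */
}

static struct irq_chip foo_msi_chip = {
	.name			= "foo-MSI",
	.irq_write_msi_msg	= foo_write_msi_msg,
	/* irq_mask/irq_unmask/irq_set_affinity are filled in by
	 * MSI_FLAG_USE_DEF_CHIP_OPS via msi_domain_update_chip_ops(). */
};

static struct msi_domain_info foo_msi_info = {
	.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.chip	= &foo_msi_chip,
};

static struct irq_domain *foo_msi_init(struct device_node *node,
				       struct irq_domain *parent)
{
	/* Default domain ops supply get_hwirq/msi_init/set_desc etc. */
	return msi_create_irq_domain(node, &foo_msi_info, parent);
}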
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include "internals.h"
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+ if (irqd_is_wakeup_armed(&desc->irq_data)) {
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_wakeup();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions++;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth++;
+
+ WARN_ON_ONCE(desc->force_resume_depth &&
+ desc->force_resume_depth != desc->nr_actions);
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth++;
+
+ WARN_ON_ONCE(desc->no_suspend_depth &&
+ desc->no_suspend_depth != desc->nr_actions);
+}
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions--;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth--;
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth--;
+}
+
+static bool suspend_device_irq(struct irq_desc *desc, int irq)
+{
+ if (!desc->action || desc->no_suspend_depth)
+ return false;
+
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ /*
+ * We return true here to force the caller to issue
+ * synchronize_irq(). We need to make sure that the
+ * IRQD_WAKEUP_ARMED flag is visible before we return from
+ * suspend_device_irqs().
+ */
+ return true;
+ }
+
+ desc->istate |= IRQS_SUSPENDED;
+ __disable_irq(desc, irq);
+
+ /*
+ * Hardware which has no wakeup source configuration facility
+ * requires that the non wakeup interrupts are masked at the
+ * chip level. The chip implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ return true;
+}
+
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
- * During system-wide suspend or hibernation device drivers need to be prevented
- * from receiving interrupts and this function is provided for this purpose.
- * It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQS_SUSPENDED flag for each of them.
+ * During system-wide suspend or hibernation, device drivers need to
+ * be prevented from receiving interrupts; this function is provided
+ * for that purpose.
+ *
+ * So we disable all interrupts and mark them IRQS_SUSPENDED except
+ * for those which are unused, those which are marked as not
+ * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
+ * set and those which are marked as active wakeup sources.
+ *
+ * The active wakeup sources are handled by the flow handler entry
+ * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
+ * interrupt and notifies the pm core about the wakeup.
*/
void suspend_device_irqs(void)
{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
+ bool sync;
raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, true);
+ sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
- for_each_irq_desc(irq, desc)
- if (desc->istate & IRQS_SUSPENDED)
+ if (sync)
synchronize_irq(irq);
+ }
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
+static void resume_irq(struct irq_desc *desc, int irq)
+{
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+
+ if (desc->istate & IRQS_SUSPENDED)
+ goto resume;
+
+ /* Force resume the interrupt? */
+ if (!desc->force_resume_depth)
+ return;
+
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+resume:
+ desc->istate &= ~IRQS_SUSPENDED;
+ __enable_irq(desc, irq);
+}
+
static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- __enable_irq(desc, irq, true);
+ resume_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
resume_irqs(false);
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
-
-/**
- * check_wakeup_irqs - check if any wake-up interrupts are pending
- */
-int check_wakeup_irqs(void)
-{
- struct irq_desc *desc;
- int irq;
-
- for_each_irq_desc(irq, desc) {
- /*
- * Only interrupts which are marked as wakeup source
- * and have not been disabled before the suspend check
- * can abort suspend.
- */
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->depth == 1 && desc->istate & IRQS_PENDING)
- return -EBUSY;
- continue;
- }
- /*
- * Check the non wakeup interrupts whether they need
- * to be masked before finally going into suspend
- * state. That's for hardware which has no wakeup
- * source configuration facility. The chip
- * implementation indicates that with
- * IRQCHIP_MASK_ON_SUSPEND.
- */
- if (desc->istate & IRQS_SUSPENDED &&
- irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
- mask_irq(desc);
- }
-
- return 0;
-}
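For context, a driver-side sketch (not part of this patch, foo_* names invented) of the two ways an interrupt escapes the new suspend handling above: a line requested with IRQF_NO_SUSPEND is accounted in desc->no_suspend_depth and stays enabled, while one armed with enable_irq_wake() becomes a wakeup source handled by irq_pm_check_wakeup() from the flow handler.

#include <linux/interrupt.h>

static irqreturn_t foo_irq_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int foo_request_irqs(int timer_irq, int wake_irq, void *dev)
{
	int ret;

	/* Never suspended: bumps desc->no_suspend_depth. */
	ret = request_irq(timer_irq, foo_irq_handler, IRQF_NO_SUSPEND,
			  "foo-timer", dev);
	if (ret)
		return ret;

	/* Ordinary interrupt, but armed as a system wakeup source. */
	ret = request_irq(wake_irq, foo_irq_handler, 0, "foo-wake", dev);
	if (ret)
		return ret;

	return enable_irq_wake(wake_irq);
}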
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
#include "internals.h"
+/*
+ * Access rules:
+ *
+ * procfs protects read/write of /proc/irq/N/ files against a
+ * concurrent free of the interrupt descriptor. remove_proc_entry()
+ * immediately prevents new read/writes to happen and waits for
+ * already running read/write functions to complete.
+ *
+ * We remove the proc entries first and then delete the interrupt
+ * descriptor from the radix tree and free it. So it is guaranteed
+ * that irq_to_desc(N) is valid as long as the read/writes are
+ * permitted by procfs.
+ *
+ * The read from /proc/interrupts is a different problem because there
+ * is no protection. So the lookup and the access to irqdesc
+ * information must be protected by sparse_irq_lock.
+ */
static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
+ irq_lock_sparse();
desc = irq_to_desc(i);
if (!desc)
- return 0;
+ goto outsparse;
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
out:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+outsparse:
+ irq_unlock_sparse();
return 0;
}
#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work)
/* If the work is "lazy", handle it from next tick if any */
if (work->flags & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+ if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
} else {
- if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
+ if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
@@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
- raised = &__get_cpu_var(raised_list);
- lazy = &__get_cpu_var(lazy_list);
- if (llist_empty(raised) && llist_empty(lazy))
- return false;
+ raised = this_cpu_ptr(&raised_list);
+ lazy = this_cpu_ptr(&lazy_list);
+
+ if (llist_empty(raised) || arch_irq_work_has_interrupt())
+ if (llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list)
*/
void irq_work_run(void)
{
- irq_work_run_list(&__get_cpu_var(raised_list));
- irq_work_run_list(&__get_cpu_var(lazy_list));
+ irq_work_run_list(this_cpu_ptr(&raised_list));
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
+void irq_work_tick(void)
+{
+ struct llist_head *raised = this_cpu_ptr(&raised_list);
+
+ if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
+ irq_work_run_list(raised);
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
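A small usage sketch (not part of this patch, foo_* names invented), mirroring how printk sets up its klogd wakeup work: a lazy item queued with irq_work_queue() raises no IPI and, with the new irq_work_tick() above, is drained from the tick even on architectures without a dedicated irq_work interrupt.

#include <linux/irq_work.h>
#include <linux/printk.h>

static void foo_irq_work_fn(struct irq_work *work)
{
	pr_info("foo: running from hard interrupt context\n");
}

/* IRQ_WORK_LAZY: no IPI is raised, the work runs from the next tick. */
static struct irq_work foo_work = {
	.flags	= IRQ_WORK_LAZY,
	.func	= foo_irq_work_fn,
};

static void foo_kick(void)
{
	/* Safe to call from NMI and hard interrupt context. */
	irq_work_queue(&foo_work);
}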
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ae5167087845..5c5987f10819 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file)
* using get_symbol_offset for every symbol.
*/
struct kallsym_iter *iter;
- int ret;
-
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
if (!iter)
return -ENOMEM;
reset_iter(iter, 0);
- ret = seq_open(file, &kallsyms_op);
- if (ret == 0)
- ((struct seq_file *)file->private_data)->private = iter;
- else
- kfree(iter);
- return ret;
+ return 0;
}
#ifdef CONFIG_KGDB_KDB
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2bee072268d9..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!kexec_on_panic) {
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
- pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ pr_err("Could not allocate swap buffer\n");
goto out_free_control_pages;
}
}
@@ -1759,7 +1759,6 @@ static __initdata char *suffix_tbl[] = {
*/
static int __init parse_crashkernel_suffix(char *cmdline,
unsigned long long *crash_size,
- unsigned long long *crash_base,
const char *suffix)
{
char *cur = cmdline;
@@ -1848,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline,
if (suffix)
return parse_crashkernel_suffix(ck_cmdline, crash_size,
- crash_base, suffix);
+ suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -2016,22 +2015,6 @@ static int __init crash_save_vmcoreinfo_init(void)
subsys_initcall(crash_save_vmcoreinfo_init);
#ifdef CONFIG_KEXEC_FILE
-static int __kexec_add_segment(struct kimage *image, char *buf,
- unsigned long bufsz, unsigned long mem,
- unsigned long memsz)
-{
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = buf;
- ksegment->bufsz = bufsz;
- ksegment->mem = mem;
- ksegment->memsz = memsz;
- image->nr_segments++;
-
- return 0;
-}
-
static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
struct kexec_buf *kbuf)
{
@@ -2064,8 +2047,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
} while (1);
/* If we are here, we found a suitable memory range */
- __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
- kbuf->memsz);
+ kbuf->mem = temp_start;
/* Success, stop navigating through remaining System RAM ranges */
return 1;
@@ -2099,8 +2081,7 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
} while (1);
/* If we are here, we found a suitable memory range */
- __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
- kbuf->memsz);
+ kbuf->mem = temp_start;
/* Success, stop navigating through remaining System RAM ranges */
return 1;
@@ -2187,7 +2168,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
}
/* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments - 1];
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
*load_addr = ksegment->mem;
return 0;
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
-/*
- * kmod_thread_locker is used for deadlock avoidance. There is no explicit
- * locking to protect this global - it is private to the singleton khelper
- * thread and should only ever be modified by that thread.
- */
-static const struct task_struct *kmod_thread_locker;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...)
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
/*
* This is the task which runs the usermode application
*/
@@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data)
retval = -ENOMEM;
new = prepare_kernel_cred(current);
if (!new)
- goto fail;
+ goto out;
spin_lock(&umh_sysctl_lock);
new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data)
retval = sub_info->init(sub_info, new);
if (retval) {
abort_creds(new);
- goto fail;
+ goto out;
}
}
@@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data)
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
if (!retval)
return 0;
-
- /* Exec failed? */
-fail:
- sub_info->retval = retval;
do_exit(0);
}
-static int call_helper(void *data)
-{
- /* Worker thread started blocking khelper thread. */
- kmod_thread_locker = current;
- return ____call_usermodehelper(data);
-}
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
- /* CLONE_VFORK: wait until the usermode helper has execve'd
- * successfully We need the data structures to stay around
- * until that is done. */
- if (wait == UMH_WAIT_PROC)
+ if (sub_info->wait & UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else {
- pid = kernel_thread(call_helper, sub_info,
- CLONE_VFORK | SIGCHLD);
- /* Worker thread stopped blocking khelper thread. */
- kmod_thread_locker = NULL;
- }
-
- switch (wait) {
- case UMH_NO_WAIT:
- call_usermodehelper_freeinfo(sub_info);
- break;
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ SIGCHLD);
- case UMH_WAIT_PROC:
- if (pid > 0)
- break;
- /* FALLTHROUGH */
- case UMH_WAIT_EXEC:
- if (pid < 0)
- sub_info->retval = pid;
+ if (pid < 0) {
+ sub_info->retval = pid;
umh_complete(sub_info);
}
}
@@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
goto out;
}
/*
- * Worker thread must not wait for khelper thread at below
- * wait_for_completion() if the thread was created with CLONE_VFORK
- * flag, for khelper thread is already waiting for the thread at
- * wait_for_completion() in do_fork().
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
*/
- if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
- retval = -EBUSY;
- goto out;
- }
-
- sub_info->complete = &done;
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
queue_work(khelper_wq, &sub_info->work);
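For reference, a caller-side sketch (not part of this patch; the helper path and arguments are invented): with UMH_NO_WAIT the completion pointer is now left NULL, so the helper itself releases the subprocess_info through umh_complete().

#include <linux/kmod.h>

static int foo_run_helper(void)
{
	char *argv[] = { "/sbin/foo-helper", "start", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* Fire and forget: no waiting for exec or exit. */
	return call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT);
}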
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..06f58309fed2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
- .flags = FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+int __weak arch_check_ftrace_location(struct kprobe *p)
{
- int ret = 0;
unsigned long ftrace_addr;
- /*
- * If the address is located on a ftrace nop, set the
- * breakpoint to the following instruction.
- */
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
return -EINVAL;
#endif
}
+ return 0;
+}
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
+{
+ int ret;
+
+ ret = arch_check_ftrace_location(p);
+ if (ret)
+ return ret;
jump_label_lock();
preempt_disable();
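A registration sketch (not part of this patch; foo_* names and the probed symbol are purely illustrative): any kprobe whose address lands on an ftrace location is now vetted through the weak arch_check_ftrace_location() hook before the module and blacklist checks run.

#include <linux/kprobes.h>

static int foo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("foo: hit probe at %s\n", p->symbol_name);
	return 0;	/* let execution continue normally */
}

static struct kprobe foo_kp = {
	.symbol_name	= "do_fork",	/* illustrative target */
	.pre_handler	= foo_pre_handler,
};

static int __init foo_kprobe_init(void)
{
	return register_kprobe(&foo_kp);
}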
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
{
struct task_struct *p;
- p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0955b885d0dc..ec8cce259779 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -20,30 +20,20 @@
* Author: Paul E. McKenney <paulmck@us.ibm.com>
* Based on kernel/rcu/torture.c.
*/
-#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/module.h>
#include <linux/kthread.h>
-#include <linux/err.h>
#include <linux/spinlock.h>
+#include <linux/rwlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/reboot.h>
-#include <linux/freezer.h>
-#include <linux/cpu.h>
#include <linux/delay.h>
-#include <linux/stat.h>
#include <linux/slab.h>
-#include <linux/trace_clock.h>
-#include <asm/byteorder.h>
#include <linux/torture.h>
MODULE_LICENSE("GPL");
@@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
torture_param(int, nwriters_stress, -1,
"Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1,
+ "Number of read-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (s), 0=disable");
@@ -66,30 +58,28 @@ torture_param(bool, verbose, true,
static char *torture_type = "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
- "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
-
-static atomic_t n_lock_torture_errors;
+ "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
-static int nrealwriters_stress;
static bool lock_is_write_held;
+static bool lock_is_read_held;
-struct lock_writer_stress_stats {
- long n_write_lock_fail;
- long n_write_lock_acquired;
+struct lock_stress_stats {
+ long n_lock_fail;
+ long n_lock_acquired;
};
-static struct lock_writer_stress_stats *lwsa;
#if defined(MODULE)
#define LOCKTORTURE_RUNNABLE_INIT 1
#else
#define LOCKTORTURE_RUNNABLE_INIT 0
#endif
-int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
-module_param(locktorture_runnable, int, 0444);
-MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
+int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -102,12 +92,25 @@ struct lock_torture_ops {
int (*writelock)(void);
void (*write_delay)(struct torture_random_state *trsp);
void (*writeunlock)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *trsp);
+ void (*readunlock)(void);
unsigned long flags;
const char *name;
};
-static struct lock_torture_ops *cur_ops;
-
+struct lock_torture_cxt {
+ int nrealwriters_stress;
+ int nrealreaders_stress;
+ bool debug_lock;
+ atomic_t n_lock_torture_errors;
+ struct lock_torture_ops *cur_ops;
+ struct lock_stress_stats *lwsa; /* writer statistics */
+ struct lock_stress_stats *lrsa; /* reader statistics */
+};
+static struct lock_torture_cxt cxt = { 0, 0, false,
+ ATOMIC_INIT(0),
+ NULL, NULL};
/*
* Definitions for lock torture testing.
*/
@@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2000 * longdelay_us)))
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
mdelay(longdelay_us);
#ifdef CONFIG_PREEMPT
- if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
#endif
}
@@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
.writeunlock = torture_lock_busted_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "lock_busted"
};
@@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2000 * longdelay_us)))
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
mdelay(longdelay_us);
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
- if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
#endif
}
@@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
.writeunlock = torture_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "spin_lock"
};
static int torture_spin_lock_write_lock_irq(void)
-__acquires(torture_spinlock_irq)
+__acquires(torture_spinlock)
{
unsigned long flags;
spin_lock_irqsave(&torture_spinlock, flags);
- cur_ops->flags = flags;
+ cxt.cur_ops->flags = flags;
return 0;
}
static void torture_lock_spin_write_unlock_irq(void)
__releases(torture_spinlock)
{
- spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
+ spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
.writeunlock = torture_lock_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "spin_lock_irq"
};
+static DEFINE_RWLOCK(torture_rwlock);
+
+static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+{
+ write_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+{
+ write_unlock(&torture_rwlock);
+}
+
+static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+{
+ read_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 10;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+{
+ read_unlock(&torture_rwlock);
+}
+
+static struct lock_torture_ops rw_lock_ops = {
+ .writelock = torture_rwlock_write_lock,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock,
+ .readlock = torture_rwlock_read_lock,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock,
+ .name = "rw_lock"
+};
+
+static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_write_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_read_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops rw_lock_irq_ops = {
+ .writelock = torture_rwlock_write_lock_irq,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock_irq,
+ .readlock = torture_rwlock_read_lock_irq,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock_irq,
+ .name = "rw_lock_irq"
+};
+
+static DEFINE_MUTEX(torture_mutex);
+
+static int torture_mutex_lock(void) __acquires(torture_mutex)
+{
+ mutex_lock(&torture_mutex);
+ return 0;
+}
+
+static void torture_mutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 5);
+ else
+ mdelay(longdelay_ms / 5);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_mutex_unlock(void) __releases(torture_mutex)
+{
+ mutex_unlock(&torture_mutex);
+}
+
+static struct lock_torture_ops mutex_lock_ops = {
+ .writelock = torture_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .writeunlock = torture_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "mutex_lock"
+};
+
+static DECLARE_RWSEM(torture_rwsem);
+static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+{
+ down_write(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 10);
+ else
+ mdelay(longdelay_ms / 10);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+{
+ up_write(&torture_rwsem);
+}
+
+static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+{
+ down_read(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 2);
+ else
+ mdelay(longdelay_ms / 2);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+{
+ up_read(&torture_rwsem);
+}
+
+static struct lock_torture_ops rwsem_lock_ops = {
+ .writelock = torture_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .writeunlock = torture_rwsem_up_write,
+ .readlock = torture_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_rwsem_up_read,
+ .name = "rwsem_lock"
+};
+
/*
* Lock torture writer kthread. Repeatedly acquires and releases
* the lock, checking for duplicate acquisitions.
*/
static int lock_torture_writer(void *arg)
{
- struct lock_writer_stress_stats *lwsp = arg;
+ struct lock_stress_stats *lwsp = arg;
static DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
@@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg)
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cur_ops->writelock();
+
+ cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_write_lock_fail++;
+ lwsp->n_lock_fail++;
lock_is_write_held = 1;
- lwsp->n_write_lock_acquired++;
- cur_ops->write_delay(&rand);
+ if (WARN_ON_ONCE(lock_is_read_held))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ cxt.cur_ops->write_delay(&rand);
lock_is_write_held = 0;
- cur_ops->writeunlock();
+ cxt.cur_ops->writeunlock();
+
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
torture_kthread_stopping("lock_torture_writer");
@@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg)
}
/*
+ * Lock torture reader kthread. Repeatedly acquires and releases
+ * the reader lock.
+ */
+static int lock_torture_reader(void *arg)
+{
+ struct lock_stress_stats *lrsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->readlock();
+ lock_is_read_held = 1;
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lrsp->n_lock_fail++; /* rare, but... */
+
+ lrsp->n_lock_acquired++;
+ cxt.cur_ops->read_delay(&rand);
+ lock_is_read_held = 0;
+ cxt.cur_ops->readunlock();
+
+ stutter_wait("lock_torture_reader");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_reader");
+ return 0;
+}
+
+/*
* Create an lock-torture-statistics message in the specified buffer.
*/
-static void lock_torture_printk(char *page)
+static void __torture_print_stats(char *page,
+ struct lock_stress_stats *statp, bool write)
{
bool fail = 0;
- int i;
+ int i, n_stress;
long max = 0;
- long min = lwsa[0].n_write_lock_acquired;
+ long min = statp[0].n_lock_acquired;
long long sum = 0;
- for (i = 0; i < nrealwriters_stress; i++) {
- if (lwsa[i].n_write_lock_fail)
+ n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
+ for (i = 0; i < n_stress; i++) {
+ if (statp[i].n_lock_fail)
fail = true;
- sum += lwsa[i].n_write_lock_acquired;
- if (max < lwsa[i].n_write_lock_fail)
- max = lwsa[i].n_write_lock_fail;
- if (min > lwsa[i].n_write_lock_fail)
- min = lwsa[i].n_write_lock_fail;
+ sum += statp[i].n_lock_acquired;
+ if (max < statp[i].n_lock_fail)
+ max = statp[i].n_lock_fail;
+ if (min > statp[i].n_lock_fail)
+ min = statp[i].n_lock_fail;
}
- page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
page += sprintf(page,
- "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ write ? "Writes" : "Reads ",
sum, max, min, max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
- atomic_inc(&n_lock_torture_errors);
+ atomic_inc(&cxt.n_lock_torture_errors);
}
/*
@@ -274,18 +533,35 @@ static void lock_torture_printk(char *page)
*/
static void lock_torture_stats_print(void)
{
- int size = nrealwriters_stress * 200 + 8192;
+ int size = cxt.nrealwriters_stress * 200 + 8192;
char *buf;
+ if (cxt.cur_ops->readlock)
+ size += cxt.nrealreaders_stress * 200 + 8192;
+
buf = kmalloc(size, GFP_KERNEL);
if (!buf) {
pr_err("lock_torture_stats_print: Out of memory, need: %d",
size);
return;
}
- lock_torture_printk(buf);
+
+ __torture_print_stats(buf, cxt.lwsa, true);
pr_alert("%s", buf);
kfree(buf);
+
+ if (cxt.cur_ops->readlock) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lrsa, false);
+ pr_alert("%s", buf);
+ kfree(buf);
+ }
}
/*
@@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
- torture_type, tag, nrealwriters_stress, stat_interval, verbose,
- shuffle_interval, stutter, shutdown_secs,
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, cxt.debug_lock ? " [debug]": "",
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
+ verbose, shuffle_interval, stutter, shutdown_secs,
onoff_interval, onoff_holdoff);
}
@@ -322,46 +599,59 @@ static void lock_torture_cleanup(void)
{
int i;
- if (torture_cleanup())
+ if (torture_cleanup_begin())
return;
if (writer_tasks) {
- for (i = 0; i < nrealwriters_stress; i++)
+ for (i = 0; i < cxt.nrealwriters_stress; i++)
torture_stop_kthread(lock_torture_writer,
writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
+ if (reader_tasks) {
+ for (i = 0; i < cxt.nrealreaders_stress; i++)
+ torture_stop_kthread(lock_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+
torture_stop_kthread(lock_torture_stats, stats_task);
lock_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (atomic_read(&n_lock_torture_errors))
- lock_torture_print_module_parms(cur_ops,
+ if (atomic_read(&cxt.n_lock_torture_errors))
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: FAILURE");
else if (torture_onoff_failures())
- lock_torture_print_module_parms(cur_ops,
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: LOCK_HOTPLUG");
else
- lock_torture_print_module_parms(cur_ops,
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+ torture_cleanup_end();
}
static int __init lock_torture_init(void)
{
- int i;
+ int i, j;
int firsterr = 0;
static struct lock_torture_ops *torture_ops[] = {
- &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
+ &lock_busted_ops,
+ &spin_lock_ops, &spin_lock_irq_ops,
+ &rw_lock_ops, &rw_lock_irq_ops,
+ &mutex_lock_ops,
+ &rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
- cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0)
+ cxt.cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cxt.cur_ops->name) == 0)
break;
}
if (i == ARRAY_SIZE(torture_ops)) {
@@ -374,31 +664,69 @@ static int __init lock_torture_init(void)
torture_init_end();
return -EINVAL;
}
- if (cur_ops->init)
- cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+ if (cxt.cur_ops->init)
+ cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nwriters_stress >= 0)
- nrealwriters_stress = nwriters_stress;
+ cxt.nrealwriters_stress = nwriters_stress;
else
- nrealwriters_stress = 2 * num_online_cpus();
- lock_torture_print_module_parms(cur_ops, "Start of test");
+ cxt.nrealwriters_stress = 2 * num_online_cpus();
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ if (strncmp(torture_type, "mutex", 5) == 0)
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if ((strncmp(torture_type, "spin", 4) == 0) ||
+ (strncmp(torture_type, "rw_lock", 7) == 0))
+ cxt.debug_lock = true;
+#endif
/* Initialize the statistics so that each run gets its own numbers. */
lock_is_write_held = 0;
- lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
- if (lwsa == NULL) {
- VERBOSE_TOROUT_STRING("lwsa: Out of memory");
+ cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- for (i = 0; i < nrealwriters_stress; i++) {
- lwsa[i].n_write_lock_fail = 0;
- lwsa[i].n_write_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
}
- /* Start up the kthreads. */
+ if (cxt.cur_ops->readlock) {
+ if (nreaders_stress >= 0)
+ cxt.nrealreaders_stress = nreaders_stress;
+ else {
+ /*
+ * By default, distribute readers and writers evenly.
+ * We still run the same total number of threads as
+ * the writer-only locks do by default.
+ */
+ if (nwriters_stress < 0) /* user doesn't care */
+ cxt.nrealwriters_stress = num_online_cpus();
+ cxt.nrealreaders_stress = cxt.nrealwriters_stress;
+ }
+
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
+ }
+ lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
+ /* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ);
@@ -422,18 +750,51 @@ static int __init lock_torture_init(void)
goto unwind;
}
- writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
+ writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- for (i = 0; i < nrealwriters_stress; i++) {
- firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
+
+ if (cxt.cur_ops->readlock) {
+ reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ }
+
+ /*
+ * Create the kthreads and start torturing (oh, those poor little locks).
+ *
+ * TODO: Note that we interleave writers with readers, giving writers a
+ * slight advantage, by creating their kthreads first. This can be modified
+ * for very specific needs, or the user could even be allowed to choose
+ * the policy, if ever wanted.
+ */
+ for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
+ j < cxt.nrealreaders_stress; i++, j++) {
+ if (i >= cxt.nrealwriters_stress)
+ goto create_reader;
+
+ /* Create writer. */
+ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
writer_tasks[i]);
if (firsterr)
goto unwind;
+
+ create_reader:
+ if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
+ continue;
+ /* Create reader. */
+ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
+ reader_tasks[j]);
+ if (firsterr)
+ goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
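To show what the reworked ops table expects from a new lock type, a sketch (not part of this patch, foo_* names invented): only the writer hooks are mandatory, reader hooks may stay NULL as in the spinlock and mutex types, and the new entry would still have to be added to torture_ops[] in lock_torture_init().

static DEFINE_SPINLOCK(foo_lock);

static int torture_foo_write_lock(void) __acquires(foo_lock)
{
	spin_lock(&foo_lock);
	return 0;
}

static void torture_foo_write_unlock(void) __releases(foo_lock)
{
	spin_unlock(&foo_lock);
}

static struct lock_torture_ops foo_lock_ops = {
	.writelock	= torture_foo_write_lock,
	.write_delay	= torture_spin_lock_write_delay, /* reuse existing delay */
	.writeunlock	= torture_foo_write_unlock,
	.readlock	= NULL,
	.read_delay	= NULL,
	.readunlock	= NULL,
	.name		= "foo_lock"
};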
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5930e9..4d60986fcbee 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@ do { \
* If the lock has already been acquired, then this will proceed to spin
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
static inline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- mutex_clear_owner(lock);
}
/*
* __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
* mutexes so that we can do it here after we've verified state.
*/
+ mutex_clear_owner(lock);
atomic_set(&lock->count, 1);
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b25e492..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+/*
+ * after acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ unsigned long flags;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ lock->ctx = ctx;
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the atomic read is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Check if lock is contended, if not there is nobody to wake up
+ */
+ if (likely(atomic_read(&lock->base.count) == 0))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, wake up everyone in this case,
+ * so they can see the new lock->ctx.
+ */
+ spin_lock_mutex(&lock->base.wait_lock, flags);
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+ spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
+
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,135 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
return retval;
}
+
+/*
+ * Atomically try to take the lock when it is available
+ */
+static inline bool mutex_try_to_acquire(struct mutex *lock)
+{
+ return !mutex_is_locked(lock) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ */
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+
+ if (!mutex_can_spin_on_owner(lock))
+ goto done;
+
+ if (!osq_lock(&lock->osq))
+ goto done;
+
+ while (true) {
+ struct task_struct *owner;
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ */
+ if (ACCESS_ONCE(ww->ctx))
+ break;
+ }
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ /* Try to acquire the mutex if it is unlocked. */
+ if (mutex_try_to_acquire(lock)) {
+ lock_acquired(&lock->dep_map, ip);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww;
+ ww = container_of(lock, struct ww_mutex, base);
+
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ }
+
+ mutex_set_owner(lock);
+ osq_unlock(&lock->osq);
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+done:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
+ schedule_preempt_disabled();
+ }
+
+ return false;
+}
+#else
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ return false;
+}
#endif
__visible __used noinline
@@ -277,91 +492,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
return 0;
}
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
- struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
-}
-
-/*
- * after acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- *
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
- */
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
-{
- unsigned long flags;
- struct mutex_waiter *cur;
-
- ww_mutex_lock_acquired(lock, ctx);
-
- lock->ctx = ctx;
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* ^^^ */
-
- /*
- * Check if lock is contended, if not there is nobody to wake up
- */
- if (likely(atomic_read(&lock->base.count) == 0))
- return;
-
- /*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
- */
- spin_lock_mutex(&lock->base.wait_lock, flags);
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
- spin_unlock_mutex(&lock->base.wait_lock, flags);
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
@@ -378,104 +508,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- /*
- * Optimistic spinning.
- *
- * We try to spin for acquisition when we find that the lock owner
- * is currently running on a (different) CPU and while we don't
- * need to reschedule. The rationale is that if the lock owner is
- * running, it is likely to release the lock soon.
- *
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- *
- * The mutex spinners are queued up using MCS lock so that only one
- * spinner can compete for the mutex. However, if mutex spinning isn't
- * going to happen, there is no point in going through the lock/unlock
- * overhead.
- */
- if (!mutex_can_spin_on_owner(lock))
- goto slowpath;
-
- if (!osq_lock(&lock->osq))
- goto slowpath;
-
- for (;;) {
- struct task_struct *owner;
-
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- /*
- * If ww->ctx is set the contents are undefined, only
- * by acquiring wait_lock there is a guarantee that
- * they are not invalid when reading.
- *
- * As such, when deadlock detection needs to be
- * performed the optimistic spinning cannot be done.
- */
- if (ACCESS_ONCE(ww->ctx))
- break;
- }
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
-
- /* Try to acquire the mutex if it is unlocked. */
- if (!mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
- lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx) {
- struct ww_mutex *ww;
- ww = container_of(lock, struct ww_mutex, base);
-
- ww_mutex_set_context_fastpath(ww, ww_ctx);
- }
-
- mutex_set_owner(lock);
- osq_unlock(&lock->osq);
- preempt_enable();
- return 0;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax_lowlatency();
+ if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ /* got the lock, yay! */
+ preempt_enable();
+ return 0;
}
- osq_unlock(&lock->osq);
-slowpath:
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock the mutex. This avoids getting
- * scheduled out right after we obtained the mutex.
- */
- if (need_resched())
- schedule_preempt_disabled();
-#endif
+
spin_lock_mutex(&lock->wait_lock, flags);
/*
@@ -679,15 +717,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
* Release the lock, slowpath:
*/
static inline void
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
/*
- * some architectures leave the lock unlocked in the fastpath failure
+ * As a performance measure, release the lock before doing other
+ * wakeup-related duties to follow. This allows other tasks to acquire
+ * the lock sooner, while still handling cleanups in past unlock calls.
+ * This can be done as we do not enforce strict equivalence between the
+ * mutex counter and wait_list.
+ *
+ * Some architectures leave the lock unlocked in the fastpath failure
 * case, others need to leave it locked. In the latter case we have to
- * unlock it here
+ * unlock it here - as the lock counter is currently 0 or negative.
*/
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
@@ -716,7 +760,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
__visible void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
- __mutex_unlock_common_slowpath(lock_count, 1);
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_unlock_common_slowpath(lock, 1);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..5cda397607f2 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a141b3b..7c98873a3077 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
- * See Documentation/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
#include <linux/export.h>
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d6203faf2eb1..7628c3fc37ca 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}
+EXPORT_SYMBOL(rwsem_down_read_failed);
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* try acquiring the write lock */
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
- return true;
- }
+ /*
+ * Try acquiring the write lock. Check count first in order
+ * to reduce unnecessary expensive cmpxchg() operations.
+ */
+ if (count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return true;
}
+
return false;
}
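
The rewritten rwsem_try_write_lock() reads the count with a plain comparison and only issues the expensive cmpxchg() when the value already equals RWSEM_WAITING_BIAS. A minimal stand-alone sketch of that check-before-cmpxchg pattern (user-space C11 atomics; WAITING_VAL and ACTIVE_VAL are made-up stand-ins for the rwsem bias constants):

#include <stdatomic.h>
#include <stdbool.h>

enum { WAITING_VAL = 1, ACTIVE_VAL = 2 };

static bool try_write_lock(atomic_long *count)
{
	long c = atomic_load_explicit(count, memory_order_relaxed);

	if (c != WAITING_VAL)		/* cheap read filters out hopeless attempts */
		return false;
	/* Only now pay for the cache-line-dirtying compare-and-swap. */
	return atomic_compare_exchange_strong(count, &c, ACTIVE_VAL);
}
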
@@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
return sem;
}
+EXPORT_SYMBOL(rwsem_down_write_failed);
/*
* handle waking up a waiter on the semaphore
@@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
return sem;
}
+EXPORT_SYMBOL(rwsem_wake);
/*
* downgrade a write lock into a read lock
@@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
return sem;
}
-
-EXPORT_SYMBOL(rwsem_down_read_failed);
-EXPORT_SYMBOL(rwsem_down_write_failed);
-EXPORT_SYMBOL(rwsem_wake);
EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..b8120abe594b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
-static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
/**
@@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock);
/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
- * @jiffies: how long to wait before failing
+ * @timeout: how long to wait before failing
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long jiffies)
+int down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
@@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies)
if (likely(sem->count > 0))
sem->count--;
else
- result = __down_timeout(sem, jiffies);
+ result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem)
return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
-static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
- return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
static noinline void __sched __up(struct semaphore *sem)
diff --git a/kernel/module.c b/kernel/module.c
index 03214bd288e9..3965511ae133 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -98,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add use RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -135,7 +134,7 @@ static int param_set_bool_enable_only(const char *val,
}
static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool_enable_only,
.get = param_get_bool,
};
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-int register_module_notifier(struct notifier_block * nb)
+int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);
-int unregister_module_notifier(struct notifier_block * nb)
+int unregister_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
+/* MODULE_REF_BASE is the base reference count held by the module loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_inc(&mod->refcnt);
return 0;
}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Others still hold references; restore the base count unless a racing put already dropped it to zero */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}
/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
+ mod->state = MODULE_STATE_GOING;
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
-
- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}
unsigned long module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
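
The per-CPU incs/decs pair is gone: the reference count is a single atomic_t seeded with MODULE_REF_BASE, so module_refcount() is simply "current value minus the base", and dropping the base to zero marks the module as going away. A hedged user-space sketch of the same scheme (the kernel uses atomic_inc_not_zero()/atomic_dec_if_positive() to make zero a terminal state; names here are invented):

#include <stdatomic.h>
#include <stdbool.h>

#define REF_BASE 1			/* base reference owned by the loader */

static atomic_int refcnt = REF_BASE;

static bool ref_get(void)		/* like try_module_get() */
{
	int old = atomic_load(&refcnt);

	while (old != 0) {		/* zero means "module is going", refuse */
		if (atomic_compare_exchange_weak(&refcnt, &old, old + 1))
			return true;
	}
	return false;
}

static void ref_put(void)		/* like module_put() */
{
	atomic_fetch_sub(&refcnt, 1);
}

static long ref_users(void)		/* like module_refcount() */
{
	return atomic_load(&refcnt) - REF_BASE;
}
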
@@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
seq_printf(m, " %lu ", module_refcount(mod));
- /* Always include a trailing , so userspace can differentiate
- between this and the old multi-field proc format. */
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
seq_printf(m, "%s,", use->source->name);
@@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
if (mod->init != NULL && mod->exit == NULL) {
printed_something = 1;
- seq_printf(m, "[permanent],");
+ seq_puts(m, "[permanent],");
}
if (!printed_something)
- seq_printf(m, "-");
+ seq_puts(m, "-");
}
void __symbol_put(const char *symbol)
@@ -935,7 +918,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -948,11 +931,11 @@ bool try_module_get(struct module *module)
if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;
preempt_enable();
@@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get);
void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
@@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put);
static inline void print_unload_info(struct seq_file *m, struct module *mod)
{
/* We don't know the usage count, or what modules are using. */
- seq_printf(m, " - -");
+ seq_puts(m, " - -");
}
static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc,
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs,
return 0;
bad_version:
- printk("%s: disagrees about version of symbol %s\n",
+ pr_warn("%s: disagrees about version of symbol %s\n",
mod->name, symname);
return 0;
}
@@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
}
-struct module_sect_attr
-{
+struct module_sect_attr {
struct module_attribute mattr;
char *name;
unsigned long address;
};
-struct module_sect_attrs
-{
+struct module_sect_attrs {
struct attribute_group grp;
unsigned int nsections;
struct module_sect_attr attrs[0];
@@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod)
(attr->test && attr->test(mod))) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
++temp_attr;
}
}
@@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
- sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
if (attr->free)
attr->free(mod);
}
@@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1842,7 +1813,9 @@ static void free_module(struct module *mod)
/* We leave it in list to prevent duplicate loads, but make sure
* that noone uses it while it's being deconstructed. */
+ mutex_lock(&module_mutex);
mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1858,7 +1831,12 @@ static void free_module(struct module *mod)
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronization before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
@@ -1953,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
- printk("%s: please compile with -fno-common\n",
+ pr_warn("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
break;
@@ -2257,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum)
{
const Elf_Shdr *sec;
@@ -2733,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
* This shouldn't happen with same compiler and binutils
* building all parts of the module.
*/
- printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+ pr_warn("%s: has both .ctors and .init_array.\n",
mod->name);
return -EINVAL;
}
@@ -3021,8 +2999,10 @@ static int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
+ /*
+ * Init routine failed: abort. Try to protect us from
+ * buggy refcounters.
+ */
mod->state = MODULE_STATE_GOING;
synchronize_sched();
module_put(mod);
@@ -3095,6 +3075,32 @@ static int may_init_module(void)
}
/*
+ * Can't use wait_event_interruptible() because our condition
+ * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
+ */
+static int wait_finished_loading(struct module *mod)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ add_wait_queue(&module_wq, &wait);
+ for (;;) {
+ if (finished_loading(mod->name))
+ break;
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&module_wq, &wait);
+
+ return ret;
+}
+
+/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
* memory exhaustion.
@@ -3114,8 +3120,8 @@ again:
|| old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
+
+ err = wait_finished_loading(mod);
if (err)
goto out_unlocked;
goto again;
@@ -3174,7 +3180,7 @@ out:
static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
- /* Check for magic 'dyndbg' arg */
+ /* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3324,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronization before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
module_deallocate(mod, info);
@@ -3388,7 +3396,7 @@ static inline int is_arm_mapping_symbol(const char *str)
{
if (str[0] == '.' && str[1] == 'L')
return true;
- return str[0] == '$' && strchr("atd", str[1])
+ return str[0] == '$' && strchr("axtd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
@@ -3657,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p)
/* Informative for users. */
seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading":
- mod->state == MODULE_STATE_COMING ? "Loading":
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
seq_printf(m, " 0x%pK", mod->module_core);
@@ -3667,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p)
if (mod->taints)
seq_printf(m, " %s", module_flags(mod, buf));
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
return 0;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
- const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_ns *ei;
struct file *file;
+ struct ns_common *ns;
int err;
file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = get_proc_ns(file_inode(file));
- ops = ei->ns_ops;
- if (nstype && (ops->type != nstype))
+ ns = get_proc_ns(file_inode(file));
+ if (nstype && (ns->ops->type != nstype))
goto out;
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
- err = ops->install(new_nsproxy, ei->ns);
+ err = ns->ops->install(new_nsproxy, ns);
if (err) {
free_nsproxy(new_nsproxy);
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index d09dc5c32c67..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
static bool crash_kexec_post_notifiers;
+int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -244,6 +245,7 @@ static const struct tnt tnts[] = {
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
* 'E' - Unsigned module has been loaded.
+ * 'L' - A soft lockup has previously occurred.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -427,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
if (args)
vprintk(args->fmt, args->args);
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
print_modules();
dump_stack();
print_oops_end_marker();
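
panic_on_warn turns the next WARN() into a full panic; note that the flag is cleared before panic() is called so a second WARN() hit on the panic path cannot recurse (with the core_param() added below, it can also be set at boot with panic_on_warn=1). A tiny sketch of that "disarm before re-entering" pattern in plain C (invented names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

static int panic_on_warn_flag = 1;

static void my_warn(const char *msg)
{
	fprintf(stderr, "WARNING: %s\n", msg);
	if (panic_on_warn_flag) {
		/* Disarm first: anything on the abort path may warn again. */
		panic_on_warn_flag = 0;
		fprintf(stderr, "panic: panic_on_warn set ...\n");
		abort();
	}
}

int main(void)
{
	my_warn("example warning");	/* aborts because the flag was set */
	return 0;
}
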
@@ -484,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+core_param(panic_on_warn, panic_on_warn, int, 0644);
static int __init setup_crash_kexec_post_notifiers(char *s)
{
diff --git a/kernel/params.c b/kernel/params.c
index 34f527023794..0af9b2c4e56c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -19,6 +19,7 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
+static void param_check_unsafe(const struct kernel_param *kp)
+{
+ if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
+ pr_warn("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
+}
+
static int parse_one(char *param,
char *val,
const char *doing,
@@ -104,11 +114,12 @@ static int parse_one(char *param,
return 0;
/* No one handled NULL, so do it here. */
if (!val &&
- !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
+ !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
mutex_lock(&param_lock);
+ param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
return err;
@@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
EXPORT_SYMBOL(param_get_bool);
struct kernel_param_ops param_ops_bool = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
@@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
EXPORT_SYMBOL(param_set_bint);
struct kernel_param_ops param_ops_bint = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
};
@@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string);
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
-extern struct kernel_param __start___param[], __stop___param[];
-
struct param_attribute
{
struct module_attribute mattr;
@@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
mutex_lock(&param_lock);
+ param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
mutex_unlock(&param_lock);
if (!err)
@@ -593,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
const struct kernel_param *kp,
const char *name)
{
- struct module_param_attrs *new;
- struct attribute **attrs;
- int err, num;
+ struct module_param_attrs *new_mp;
+ struct attribute **new_attrs;
+ unsigned int i;
/* We don't bother calling this with invisible parameters. */
BUG_ON(!kp->perm);
if (!mk->mp) {
- num = 0;
- attrs = NULL;
- } else {
- num = mk->mp->num;
- attrs = mk->mp->grp.attrs;
+ /* First allocation. */
+ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
+ if (!mk->mp)
+ return -ENOMEM;
+ mk->mp->grp.name = "parameters";
+ /* NULL-terminated attribute array. */
+ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
+ GFP_KERNEL);
+ /* Caller will cleanup via free_module_param_attrs */
+ if (!mk->mp->grp.attrs)
+ return -ENOMEM;
}
- /* Enlarge. */
- new = krealloc(mk->mp,
- sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
- GFP_KERNEL);
- if (!new) {
- kfree(attrs);
- err = -ENOMEM;
- goto fail;
- }
- /* Despite looking like the typical realloc() bug, this is safe.
- * We *want* the old 'attrs' to be freed either way, and we'll store
- * the new one in the success case. */
- attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
- if (!attrs) {
- err = -ENOMEM;
- goto fail_free_new;
- }
+ /* Enlarge allocations. */
+ new_mp = krealloc(mk->mp,
+ sizeof(*mk->mp) +
+ sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ GFP_KERNEL);
+ if (!new_mp)
+ return -ENOMEM;
+ mk->mp = new_mp;
- /* Sysfs wants everything zeroed. */
- memset(new, 0, sizeof(*new));
- memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
- memset(&attrs[num], 0, sizeof(attrs[num]));
- new->grp.name = "parameters";
- new->grp.attrs = attrs;
+ /* Extra pointer for NULL terminator */
+ new_attrs = krealloc(mk->mp->grp.attrs,
+ sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
+ GFP_KERNEL);
+ if (!new_attrs)
+ return -ENOMEM;
+ mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- sysfs_attr_init(&new->attrs[num].mattr.attr);
- new->attrs[num].param = kp;
- new->attrs[num].mattr.show = param_attr_show;
- new->attrs[num].mattr.store = param_attr_store;
- new->attrs[num].mattr.attr.name = (char *)name;
- new->attrs[num].mattr.attr.mode = kp->perm;
- new->num = num+1;
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
+ mk->mp->attrs[mk->mp->num].param = kp;
+ mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ /* Do not allow runtime DAC changes to make param writable. */
+ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
+ mk->mp->num++;
/* Fix up all the pointers, since krealloc can move us */
- for (num = 0; num < new->num; num++)
- new->grp.attrs[num] = &new->attrs[num].mattr.attr;
- new->grp.attrs[num] = NULL;
-
- mk->mp = new;
+ for (i = 0; i < mk->mp->num; i++)
+ mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
+ mk->mp->grp.attrs[mk->mp->num] = NULL;
return 0;
-
-fail_free_new:
- kfree(new);
-fail:
- mk->mp = NULL;
- return err;
}
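
add_sysfs_param() no longer rebuilds mk->mp from scratch: the first call kzalloc()s the group plus a NULL-terminated attrs array, later calls krealloc() both, and every early return leaves the previous, still-valid state for free_module_param_attrs() to release. A small user-space sketch of growing a NULL-terminated pointer array with the same "old array stays usable on failure" contract (illustrative only):

#include <stdlib.h>

/* Append item to a NULL-terminated pointer array, growing it by one slot.
 * On allocation failure the original array is left untouched, so the
 * caller's cleanup path still works -- the contract the sysfs code relies on. */
static const char **append_attr(const char **attrs, size_t *num, const char *item)
{
	/* +2: one slot for the new entry, one for the NULL terminator. */
	const char **grown = realloc(attrs, (*num + 2) * sizeof(*attrs));

	if (!grown)
		return NULL;
	grown[*num] = item;
	grown[*num + 1] = NULL;
	(*num)++;
	return grown;
}
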
#ifdef CONFIG_MODULES
static void free_module_param_attrs(struct module_kobject *mk)
{
- kfree(mk->mp->grp.attrs);
+ if (mk->mp)
+ kfree(mk->mp->grp.attrs);
kfree(mk->mp);
mk->mp = NULL;
}
@@ -685,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod,
if (kparam[i].perm == 0)
continue;
err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
- if (err)
+ if (err) {
+ free_module_param_attrs(&mod->mkobj);
return err;
+ }
params = true;
}
@@ -763,7 +768,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
}
static void __init kernel_add_sysfs_param(const char *name,
- struct kernel_param *kparam,
+ const struct kernel_param *kparam,
unsigned int name_skip)
{
struct module_kobject *mk;
@@ -798,7 +803,7 @@ static void __init kernel_add_sysfs_param(const char *name,
*/
static void __init param_sysfs_builtin(void)
{
- struct kernel_param *kp;
+ const struct kernel_param *kp;
unsigned int name_len;
char modname[MODULE_NAME_LEN];
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .proc_inum = PROC_PID_INIT_INO,
+ .ns.inum = PROC_PID_INIT_INO,
+#ifdef CONFIG_PID_NS
+ .ns.ops = &pidns_operations,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -341,6 +344,8 @@ out:
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ put_pid_ns(ns);
+
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_map;
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_map;
+ ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
- /* Ignore SIGCHLD causing any terminated children to autoreap */
+ /*
+ * Ignore SIGCHLD causing any terminated children to autoreap.
+ * This speeds up the namespace shutdown; also see the comment
+ * below.
+ */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
- /* Firstly reap the EXIT_ZOMBIE children we may have. */
+ /*
+ * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
+ * sys_wait4() will also block until our children traced from the
+ * parent namespace are detached and become EXIT_DEAD.
+ */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see free_pid().
+ * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * really care, we could reparent them to the global init. We could
+ * exit and reap ->child_reaper even if it is not the last thread in
+ * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * pid_ns can not go away until proc_kill_sb() drops the reference.
+ *
+ * But this ns can also have other tasks injected by setns()+fork().
+ * Again, ignoring the user visible semantics we do not really need
+ * to wait until they are all reaped, but they can be reparented to
+ * us and thus we need to ensure that pid->child_reaper stays valid
+ * until they all go away. See free_pid()->wake_up_process().
+ *
+ * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
+ * if reparented.
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static void *pidns_get(struct task_struct *task)
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
+static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
get_pid_ns(ns);
rcu_read_unlock();
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void pidns_put(void *ns)
+static void pidns_put(struct ns_common *ns)
{
- put_pid_ns(ns);
+ put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, void *ns)
+static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = ns;
+ struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int pidns_inum(void *ns)
-{
- struct pid_namespace *pid_ns = ns;
- return pid_ns->proc_inum;
-}
-
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
- .inum = pidns_inum,
};
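
Each namespace now embeds a struct ns_common, and to_pid_ns() recovers the enclosing pid_namespace from it with container_of(), so the generic proc_ns_operations callbacks can pass the common part around. A stand-alone illustration of that embedding pattern (user-space, structures trimmed down for the example):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ns_common { unsigned int inum; };

struct pid_namespace {
	int level;
	struct ns_common ns;		/* embedded generic part */
};

static struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
	return container_of(ns, struct pid_namespace, ns);
}

int main(void)
{
	struct pid_namespace pidns = { .level = 1, .ns = { .inum = 42 } };
	struct ns_common *generic = &pidns.ns;	/* what the callbacks receive */

	printf("level=%d inum=%u\n", to_pid_ns(generic)->level, generic->inum);
	return 0;
}
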
static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..48b28d387c7f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
+ select PM
config PM_SLEEP_SMP
def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
-config PM_RUNTIME
- bool "Run-time PM core functionality"
- depends on !IA64_HP_SIM
+config PM
+ bool "Device power management core functionality"
---help---
Enable functionality allowing I/O devices to be put into energy-saving
- (low power) states at run time (or autosuspended) after a specified
- period of inactivity and woken up in response to a hardware-generated
+ (low power) states, for example after a specified period of inactivity
+ (autosuspended), and woken up in response to a hardware-generated
wake-up event or a driver's request.
Hardware support is generally required for this functionality to work
and the bus type drivers of the buses the devices are on are
- responsible for the actual handling of the autosuspend requests and
+ responsible for the actual handling of device suspend requests and
wake-up events.
-config PM
- def_bool y
- depends on PM_SLEEP || PM_RUNTIME
-
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
@@ -298,10 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
-config PM_GENERIC_DOMAINS_RUNTIME
+config PM_GENERIC_DOMAINS_OF
def_bool y
- depends on PM_RUNTIME && PM_GENERIC_DOMAINS
+ depends on PM_GENERIC_DOMAINS && OF
config CPU_PM
bool
- depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <linux/ktime.h>
#include <trace/events/power.h>
#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
* @nr_pages: Number of memory pages processed between @start and @stop.
* @msg: Additional diagnostic message to print.
*/
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
- unsigned nr_pages, char *msg)
+void swsusp_show_speed(ktime_t start, ktime_t stop,
+ unsigned nr_pages, char *msg)
{
+ ktime_t diff;
u64 elapsed_centisecs64;
unsigned int centisecs;
unsigned int k;
unsigned int kps;
- elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
- /*
- * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
- * it is obvious enough for what went wrong.
- */
- do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ diff = ktime_sub(stop, start);
+ elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
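
With ktime_t the elapsed time is a plain nanosecond delta, and dividing by 10*NSEC_PER_MSEC (10,000,000 ns) yields centiseconds directly, replacing the old timeval arithmetic. A quick worked example of that conversion (plain C, made-up sample values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t start_ns = 1000000000;			/* 1.00 s */
	int64_t stop_ns  = 3450000000;			/* 3.45 s */
	int64_t centisecs = (stop_ns - start_ns) / 10000000;	/* 10*NSEC_PER_MSEC */

	/* Prints "elapsed 2.45 s" */
	printf("elapsed %lld.%02lld s\n",
	       (long long)(centisecs / 100), (long long)(centisecs % 100));
	return 0;
}
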
@@ -502,8 +500,14 @@ int hibernation_restore(int platform_mode)
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
- dpm_resume_end(PMSG_RECOVER);
+ /*
+ * The above should either succeed and jump to the new kernel,
+ * or return with an error. Otherwise things are just
+ * undefined, so let's be paranoid.
+ */
+ BUG_ON(!error);
}
+ dpm_resume_end(PMSG_RECOVER);
pm_restore_gfp_mask();
resume_console();
pm_restore_console();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
struct timeval;
/* kernel/power/swsusp.c */
-extern void swsusp_show_speed(struct timeval *, struct timeval *,
- unsigned int, char *);
+extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..5a6ec8678b9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
while (true) {
todo = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p == current || !freeze_task(p))
continue;
if (!freezer_should_skip(p))
todo++;
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
if (!user_only) {
@@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only)
if (!wakeup) {
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
&& freezing(p) && !frozen(p))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
}
} else {
@@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only)
return todo ? -EBUSY : 0;
}
+static bool __check_frozen_processes(void)
+{
+ struct task_struct *g, *p;
+
+ for_each_process_thread(g, p)
+ if (p != current && !freezer_should_skip(p) && !frozen(p))
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true if all freezable tasks (except for current) are frozen already
+ */
+static bool check_frozen_processes(void)
+{
+ bool ret;
+
+ read_lock(&tasklist_lock);
+ ret = __check_frozen_processes();
+ read_unlock(&tasklist_lock);
+ return ret;
+}
+
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
* The current thread will not be frozen. The same process that calls
@@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only)
int freeze_processes(void)
{
int error;
+ int oom_kills_saved;
error = __usermodehelper_disable(UMH_FREEZING);
if (error)
@@ -129,13 +154,28 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
+ pm_wakeup_clear();
printk("Freezing user space processes ... ");
pm_freezing = true;
+ oom_kills_saved = oom_kills_count();
error = try_to_freeze_tasks(true);
if (!error) {
- printk("done.");
__usermodehelper_set_disable_depth(UMH_DISABLED);
oom_killer_disable();
+
+ /*
+ * There might have been an OOM kill while we were
+ * freezing tasks, and the killed task might still be
+ * on its way out, so we have to double-check for the race.
+ */
+ if (oom_kills_count() != oom_kills_saved &&
+ !check_frozen_processes()) {
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ printk("OOM in progress.");
+ error = -EBUSY;
+ } else {
+ printk("done.");
+ }
}
printk("\n");
BUG_ON(in_atomic());
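
The OOM check in freeze_processes() follows a snapshot-and-revalidate shape: record oom_kills_count() before freezing, and afterwards, if the counter moved and not every task is verifiably frozen, fail with -EBUSY instead of proceeding with a task that may be dying half-frozen. A hedged user-space sketch of that generic pattern (names invented):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int event_count;		/* stands in for oom_kills_count() */

static bool everything_settled(void)	/* stands in for check_frozen_processes() */
{
	return true;
}

static int do_work_with_recheck(void (*work)(void))
{
	int saved = atomic_load(&event_count);

	work();
	/* An event may have raced with the work above; re-validate before
	 * declaring success, otherwise back out like the -EBUSY path. */
	if (atomic_load(&event_count) != saved && !everything_settled())
		return -1;
	return 0;
}
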
@@ -190,11 +230,11 @@ void thaw_processes(void)
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -217,10 +257,10 @@ void thaw_kernel_threads(void)
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
schedule();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..5f4c006c4b1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
};
+static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
+static struct pm_qos_constraints memory_bw_constraints = {
+ .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
+ .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .type = PM_QOS_SUM,
+ .notifiers = &memory_bandwidth_notifier,
+};
+static struct pm_qos_object memory_bandwidth_pm_qos = {
+ .constraints = &memory_bw_constraints,
+ .name = "memory_bandwidth",
+};
+
+
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
- &network_throughput_pm_qos
+ &network_throughput_pm_qos,
+ &memory_bandwidth_pm_qos,
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = {
/* unlocked internal variant */
static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
+ struct plist_node *node;
+ int total_value = 0;
+
if (plist_head_empty(&c->list))
return c->no_constraint_value;
@@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
case PM_QOS_MAX:
return plist_last(&c->list)->prio;
+ case PM_QOS_SUM:
+ plist_for_each(node, &c->list)
+ total_value += node->prio;
+
+ return total_value;
+
default:
/* runtime check for not using enum */
BUG();
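
PM_QOS_SUM is a third aggregation mode: instead of taking the smallest or largest request on the plist (as PM_QOS_MIN and PM_QOS_MAX do), it adds all requests up, which is what a cumulative resource such as memory bandwidth needs. A trivial stand-alone illustration of the three modes over a plain array (not the plist-based kernel code; assumes at least one request):

enum qos_type { QOS_MIN, QOS_MAX, QOS_SUM };

static int aggregate(enum qos_type type, const int *req, int n)
{
	int i, v = req[0];

	switch (type) {
	case QOS_MIN:
		for (i = 1; i < n; i++)
			if (req[i] < v)
				v = req[i];
		return v;
	case QOS_MAX:
		for (i = 1; i < n; i++)
			if (req[i] > v)
				v = req[i];
		return v;
	case QOS_SUM:
		for (v = 0, i = 0; i < n; i++)
			v += req[i];
		return v;
	}
	return 0;
}
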
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f1604d8cf489..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/compiler.h>
+#include <linux/ktime.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -725,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+ int bit;
+
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -1333,23 +1342,39 @@ static struct memory_bitmap copy_bm;
void swsusp_free(void)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long fb_pfn, fr_pfn;
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (swsusp_page_is_forbidden(page) &&
- swsusp_page_is_free(page)) {
- swsusp_unset_page_forbidden(page);
- swsusp_unset_page_free(page);
- __free_page(page);
- }
- }
+ if (!forbidden_pages_map || !free_pages_map)
+ goto out;
+
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ __free_page(page);
+ goto loop;
}
+
+out:
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
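
The rewritten swsusp_free() treats the forbidden and free bitmaps as two sorted streams of PFNs and frees a page only when the same PFN appears in both, instead of testing every PFN of every populated zone. The same "advance whichever stream is behind until the two match" idea over two sorted, terminator-ended arrays (user-space sketch, END stands in for BM_END_OF_MAP):

#include <stdio.h>

#define END ((unsigned long)-1)

/* Call fn() for every value present in both sorted, END-terminated arrays. */
static void for_each_common(const unsigned long *a, const unsigned long *b,
			    void (*fn)(unsigned long))
{
	unsigned long x = *a++, y = *b++;

	while (x != END && y != END) {
		if (x < y)
			x = *a++;	/* 'a' is behind, advance it */
		else if (y < x)
			y = *b++;	/* 'b' is behind, advance it */
		else {
			fn(x);		/* in both: the page that gets freed */
			x = *a++;
			y = *b++;
		}
	}
}
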
@@ -1552,11 +1577,11 @@ int hibernate_preallocate_memory(void)
struct zone *zone;
unsigned long saveable, size, max_size, count, highmem, pages = 0;
unsigned long alloc, save_highmem, pages_highmem, avail_normal;
- struct timeval start, stop;
+ ktime_t start, stop;
int error;
printk(KERN_INFO "PM: Preallocating image memory... ");
- do_gettimeofday(&start);
+ start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
if (error)
@@ -1685,9 +1710,9 @@ int hibernate_preallocate_memory(void)
free_unnecessary_pages();
out:
- do_gettimeofday(&stop);
+ stop = ktime_get();
printk(KERN_CONT "done (allocated %lu pages)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Allocated");
+ swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 18c62195660f..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
static int platform_suspend_prepare_late(suspend_state_t state)
{
+ return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
+ freeze_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_noirq(suspend_state_t state)
+{
return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
-static void platform_suspend_wake(suspend_state_t state)
+static void platform_resume_noirq(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
suspend_ops->wake();
}
-static void platform_suspend_finish(suspend_state_t state)
+static void platform_resume_early(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
+ freeze_ops->restore();
+}
+
+static void platform_resume_finish(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
return 0;
}
-static void platform_suspend_end(suspend_state_t state)
+static void platform_resume_end(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
suspend_ops->end();
}
-static void platform_suspend_recover(suspend_state_t state)
+static void platform_recover(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (error)
goto Platform_finish;
- error = dpm_suspend_end(PMSG_SUSPEND);
+ error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
+ printk(KERN_ERR "PM: late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
+ goto Devices_early_resume;
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ goto Platform_early_resume;
+ }
+ error = platform_suspend_prepare_noirq(state);
+ if (error)
goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- platform_suspend_wake(state);
- dpm_resume_start(PMSG_RESUME);
+ platform_resume_noirq(state);
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platform_early_resume:
+ platform_resume_early(state);
+
+ Devices_early_resume:
+ dpm_resume_early(PMSG_RESUME);
Platform_finish:
- platform_suspend_finish(state);
+ platform_resume_finish(state);
return error;
}
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ trace_suspend_resume(TPS("resume_console"), state, true);
resume_console();
+ trace_suspend_resume(TPS("resume_console"), state, false);
Close:
- platform_suspend_end(state);
+ platform_resume_end(state);
return error;
Recover_platform:
- platform_suspend_recover(state);
+ platform_recover(state);
goto Resume_devices;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bd91bc177c93..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
+static u32 test_repeat_count_max = 1;
+static u32 test_repeat_count_current;
void suspend_test_start(void)
{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
int status;
/* this may fail if the RTC hasn't been initialized */
+repeat:
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
if (state == PM_SUSPEND_STANDBY) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
+ if (status < 0)
+ state = PM_SUSPEND_FREEZE;
}
+ if (state == PM_SUSPEND_FREEZE) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+
if (status < 0)
printk(err_suspend, status);
+ test_repeat_count_current++;
+ if (test_repeat_count_current < test_repeat_count_max)
+ goto repeat;
+
/* Some platforms can't detect that the alarm triggered the
* wakeup, or (accordingly) disable it afterwards.
* It's supposed to give oneshot behavior; cope.
@@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata =
static int __init setup_test_suspend(char *value)
{
int i;
+ char *repeat;
+ char *suspend_type;
- /* "=mem" ==> "mem" */
+ /* example : "=mem[,N]" ==> "mem[,N]" */
value++;
+ suspend_type = strsep(&value, ",");
+ if (!suspend_type)
+ return 0;
+
+ repeat = strsep(&value, ",");
+ if (repeat) {
+ if (kstrtou32(repeat, 0, &test_repeat_count_max))
+ return 0;
+ }
+
for (i = 0; pm_labels[i]; i++)
- if (!strcmp(pm_labels[i], value)) {
+ if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
}
- printk(warn_bad_state, value);
+ printk(warn_bad_state, suspend_type);
return 0;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/crc32.h>
+#include <linux/ktime.h>
#include "power.h"
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
size_t off;
unsigned thr, run_threads, nr_threads;
unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
out_finish:
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
{
unsigned int m;
int ret = 0;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
struct bio *bio;
int err2;
unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
return ret;
}
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int eof = 0;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
unsigned nr_pages;
size_t off;
unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
wait_event(crc->done, atomic_read(&crc->stop));
atomic_set(&crc->stop, 0);
}
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret) {
printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
}
}
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
kthread_stop(data[thr].thr);
vfree(data);
}
- if (page) vfree(page);
+ vfree(page);
return ret;
}
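
The swap.c hunks above only swap the timestamp type: struct timeval via do_gettimeofday() becomes ktime_t via ktime_get(), and swsusp_show_speed() now takes the ktime_t values directly. A rough userspace analogue of that measurement, with CLOCK_MONOTONIC standing in for ktime_get() and a simplified show_speed() in place of swsusp_show_speed(); names and numbers are illustrative.

/* Userspace analogue of the ktime_get()-based timing above: CLOCK_MONOTONIC
 * stands in for ktime_get(), and show_speed() mirrors the kind of arithmetic
 * swsusp_show_speed() does.  Names and numbers are illustrative.
 */
#define _POSIX_C_SOURCE 199309L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int64_t monotonic_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void show_speed(int64_t start_ns, int64_t stop_ns,
		       unsigned int nr_pages, const char *msg)
{
	int64_t elapsed_ms = (stop_ns - start_ns) / 1000000;
	unsigned int kbps;

	if (elapsed_ms <= 0)
		elapsed_ms = 1;			/* avoid dividing by zero */
	kbps = (uint64_t)nr_pages * 4096 / 1024 * 1000 / elapsed_ms;
	printf("%s %u pages in %lld.%02lld seconds (%u KB/s)\n", msg, nr_pages,
	       (long long)(elapsed_ms / 1000),
	       (long long)((elapsed_ms % 1000) / 10), kbps);
}

int main(void)
{
	struct timespec io = { 0, 50 * 1000 * 1000 };	/* 50 ms of fake I/O */
	int64_t start = monotonic_ns();

	nanosleep(&io, NULL);
	show_speed(start, monotonic_ns(), 1024, "Wrote");
	return 0;
}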
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1ce770687ea8..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
-/* Deferred messaged from sched code are marked by this special level */
-#define SCHED_MESSAGE_LOGLEVEL -2
-
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -267,7 +264,6 @@ static u32 clear_idx;
#define LOG_ALIGN __alignof__(struct printk_log)
#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
@@ -481,7 +477,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-static int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, bool from_file)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -519,14 +515,13 @@ struct devkmsg_user {
char buf[8192];
};
-static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
- unsigned long count, loff_t pos)
+static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
- size_t len = iov_length(iv, count);
+ size_t len = iocb->ki_nbytes;
ssize_t ret = len;
if (len > LOG_LINE_MAX)
@@ -535,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
if (buf == NULL)
return -ENOMEM;
- line = buf;
- for (i = 0; i < count; i++) {
- if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
- ret = -EFAULT;
- goto out;
- }
- line += iv[i].iov_len;
+ buf[len] = '\0';
+ if (copy_from_iter(buf, len, from) != len) {
+ kfree(buf);
+ return -EFAULT;
}
/*
@@ -567,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
line = endp;
}
}
- line[len] = '\0';
printk_emit(facility, level, NULL, 0, "%s", line);
-out:
kfree(buf);
return ret;
}
@@ -802,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
const struct file_operations kmsg_fops = {
.open = devkmsg_open,
.read = devkmsg_read,
- .aio_write = devkmsg_writev,
+ .write_iter = devkmsg_write,
.llseek = devkmsg_llseek,
.poll = devkmsg_poll,
.release = devkmsg_release,
@@ -858,6 +848,9 @@ static int __init log_buf_len_setup(char *str)
}
early_param("log_buf_len", log_buf_len_setup);
+#ifdef CONFIG_SMP
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
+
static void __init log_buf_add_cpu(void)
{
unsigned int cpu_extra;
@@ -884,6 +877,9 @@ static void __init log_buf_add_cpu(void)
log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
}
+#else /* !CONFIG_SMP */
+static inline void log_buf_add_cpu(void) {}
+#endif /* CONFIG_SMP */
void __init setup_log_buf(int early)
{
@@ -1260,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
- static int saved_console_loglevel = -1;
+ static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
error = check_syslog_permissions(type, from_file);
@@ -1317,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
- if (saved_console_loglevel == -1)
+ if (saved_console_loglevel == LOGLEVEL_DEFAULT)
saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
/* Enable logging to console */
case SYSLOG_ACTION_CONSOLE_ON:
- if (saved_console_loglevel != -1) {
+ if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
console_loglevel = saved_console_loglevel;
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
}
break;
/* Set level of messages printed to console */
@@ -1337,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
error = 0;
break;
/* Number of chars in the log buffer */
@@ -1628,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
int printed_len = 0;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
- static volatile unsigned int logbuf_cpu = UINT_MAX;
+ static unsigned int logbuf_cpu = UINT_MAX;
- if (level == SCHED_MESSAGE_LOGLEVEL) {
- level = -1;
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
in_sched = true;
}
@@ -1680,12 +1676,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
*/
- if (in_sched)
- text_len = scnprintf(text, sizeof(textbuf),
- KERN_WARNING "[sched_delayed] ");
-
- text_len += vscnprintf(text + text_len,
- sizeof(textbuf) - text_len, fmt, args);
+ text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
/* mark and strip a trailing newline */
if (text_len && text[text_len-1] == '\n') {
@@ -1701,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *end_of_header = printk_skip_level(text);
switch (kern_level) {
case '0' ... '7':
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
+ /* fallthrough */
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
}
@@ -1716,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dict)
@@ -1794,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, -1, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1813,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
}
EXPORT_SYMBOL(printk_emit);
+int vprintk_default(const char *fmt, va_list args)
+{
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ r = vkdb_printf(fmt, args);
+ return r;
+ }
+#endif
+ r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(vprintk_default);
+
+/*
+ * This allows printk to be diverted to another function per cpu.
+ * This is useful for calling printk functions from within NMI
+ * without worrying about race conditions that can lock up the
+ * box.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -1836,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
+ printk_func_t vprintk_func;
va_list args;
int r;
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
va_start(args, fmt);
- r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+
+ /*
+ * If a caller overrides the per_cpu printk_func, then it needs
+ * to disable preemption when calling printk(). Otherwise
+ * the printk_func should be set to the default. No need to
+ * disable preemption here.
+ */
+ vprintk_func = this_cpu_read(printk_func);
+ r = vprintk_func(fmt, args);
+
va_end(args);
return r;
@@ -1882,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+/* Still needs to be defined for users */
+DEFINE_PER_CPU(printk_func_t, printk_func);
+
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
-void early_vprintk(const char *fmt, va_list ap)
-{
- if (early_console) {
- char buf[512];
- int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-
- early_console->write(early_console, buf, n);
- }
-}
-
asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
+ char buf[512];
+ int n;
+
+ if (!early_console)
+ return;
va_start(ap, fmt);
- early_vprintk(fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
+
+ early_console->write(early_console, buf, n);
}
#endif
@@ -2628,7 +2646,7 @@ void wake_up_klogd(void)
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
@@ -2640,11 +2658,11 @@ int printk_deferred(const char *fmt, ...)
preempt_disable();
va_start(args, fmt);
- r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
return r;
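
The printk() rework above routes every call through a per-CPU function pointer, printk_func, which normally points at vprintk_default() but can be redirected (for example from NMI or kdb context) without touching the callers. A minimal userspace sketch of that indirection, with a thread-local pointer standing in for the per-CPU variable and an invented vprintk_nmi() as the diverted path:

/* Userspace sketch of the per-CPU printk_func indirection above: a
 * thread-local function pointer stands in for the per-CPU variable,
 * and vprintk_nmi() stands in for a diverted handler.  All names are
 * illustrative.
 */
#include <stdarg.h>
#include <stdio.h>

typedef int (*printk_func_t)(const char *fmt, va_list args);

static int vprintk_default(const char *fmt, va_list args)
{
	return vprintf(fmt, args);		/* normal path */
}

static int vprintk_nmi(const char *fmt, va_list args)
{
	/* A real diverted path would typically buffer instead of printing. */
	fputs("[diverted] ", stdout);
	return vprintf(fmt, args);
}

static __thread printk_func_t printk_func = vprintk_default;

static int printk(const char *fmt, ...)
{
	va_list args;
	int r;

	va_start(args, fmt);
	r = printk_func(fmt, args);		/* indirect call, per-"CPU" */
	va_end(args);
	return r;
}

int main(void)
{
	printk("hello %d\n", 1);		/* default path */
	printk_func = vprintk_nmi;		/* temporary override */
	printk("hello %d\n", 2);		/* diverted path */
	printk_func = vprintk_default;
	return 0;
}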
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/*
* Detach all tasks we were using ptrace on. Called with tasklist held
- * for writing, and returns with it held too. But note it can release
- * and reacquire the lock.
+ * for writing.
*/
-void exit_ptrace(struct task_struct *tracer)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
struct task_struct *p, *n;
- LIST_HEAD(ptrace_dead);
-
- if (likely(list_empty(&tracer->ptraced)))
- return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
if (__ptrace_detach(tracer, p))
- list_add(&p->ptrace_entry, &ptrace_dead);
- }
-
- write_unlock_irq(&tasklist_lock);
- BUG_ON(!list_empty(&tracer->ptraced));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
+ list_add(&p->ptrace_entry, dead);
}
-
- write_lock_irq(&tasklist_lock);
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
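
The exit_ptrace() change above removes the unlock/relock dance: tracees that must be released are now only moved onto a list supplied by the caller, which does the actual release_task() calls after dropping tasklist_lock. A small userspace sketch of that collect-under-the-lock, release-after-it pattern (types and names are illustrative):

/* Userspace sketch of the pattern exit_ptrace() now follows: while the
 * lock is held, items to be freed are only moved onto a caller-supplied
 * list; the caller frees them after dropping the lock.  Illustrative only.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct task {
	struct task *next;
	int dying;
	int id;
};

static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;
static struct task *tracees;

/* Called with tasklist_lock held; never drops it. */
static void exit_ptrace(struct task **dead)
{
	struct task **pp = &tracees;

	while (*pp) {
		struct task *p = *pp;

		if (p->dying) {			/* like __ptrace_detach() saying "release me" */
			*pp = p->next;
			p->next = *dead;	/* defer the release to the caller */
			*dead = p;
		} else {
			pp = &p->next;
		}
	}
}

int main(void)
{
	struct task *dead = NULL;

	for (int i = 0; i < 3; i++) {
		struct task *t = calloc(1, sizeof(*t));

		t->id = i;
		t->dying = (i != 1);
		t->next = tracees;
		tracees = t;
	}

	pthread_mutex_lock(&tasklist_lock);
	exit_ptrace(&dead);
	pthread_mutex_unlock(&tasklist_lock);

	while (dead) {				/* the release happens here, unlocked */
		struct task *t = dead;

		dead = t->next;
		printf("releasing task %d\n", t->id);
		free(t);
	}
	return 0;
}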
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
obj-y += update.o srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+void rcu_early_boot_tests(void);
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 948a7693748e..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -49,11 +49,19 @@
#include <linux/trace_clock.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include <linux/vmalloc.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+torture_param(int, cbflood_inter_holdoff, HZ,
+ "Holdoff between floods (jiffies)");
+torture_param(int, cbflood_intra_holdoff, 1,
+ "Holdoff between bursts (jiffies)");
+torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
+torture_param(int, cbflood_n_per_burst, 20000,
+ "# callbacks per burst in flood");
torture_param(int, fqs_duration, 0,
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
@@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
static int nrealreaders;
+static int ncbflooders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
+static struct task_struct **cbflood_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
@@ -138,6 +148,7 @@ static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes;
+static atomic_long_t n_cbfloods;
static struct list_head rcu_torture_removed;
static int rcu_torture_writer_state;
@@ -157,9 +168,9 @@ static int rcu_torture_writer_state;
#else
#define RCUTORTURE_RUNNABLE_INIT 0
#endif
-int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
-module_param(rcutorture_runnable, int, 0444);
-MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
+static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
@@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
-DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
static bool barrier_phase; /* Test phase. */
@@ -242,7 +253,7 @@ struct rcu_torture_ops {
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
- void (*stats)(char *page);
+ void (*stats)(void);
int irq_capable;
int can_boost;
const char *name;
@@ -525,21 +536,21 @@ static void srcu_torture_barrier(void)
srcu_barrier(&srcu_ctl);
}
-static void srcu_torture_stats(char *page)
+static void srcu_torture_stats(void)
{
int cpu;
int idx = srcu_ctl.completed & 0x1;
- page += sprintf(page, "%s%s per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
+ pr_alert("%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
long c0, c1;
c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
- page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
- sprintf(page, "\n");
+ pr_cont("\n");
}
static void srcu_torture_synchronize_expedited(void)
@@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks torture testing.
+ */
+
+static int tasks_torture_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_torture_read_unlock(int idx)
+{
+}
+
+static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops tasks_ops = {
+ .ttype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = tasks_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = tasks_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_tasks_torture_deferred_free,
+ .sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_tasks,
+ .call = call_rcu_tasks,
+ .cb_barrier = rcu_barrier_tasks,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "tasks"
+};
+
+#define RCUTORTURE_TASKS_OPS &tasks_ops,
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUTORTURE_TASKS_OPS
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg)
}
call_rcu_time = jiffies;
}
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
@@ -707,6 +764,59 @@ checkwait: stutter_wait("rcu_torture_boost");
return 0;
}
+static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
+{
+}
+
+/*
+ * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
+ * to call_rcu() or analogous, increasing the probability of occurrence
+ * of callback-overflow corner cases.
+ */
+static int
+rcu_torture_cbflood(void *arg)
+{
+ int err = 1;
+ int i;
+ int j;
+ struct rcu_head *rhp;
+
+ if (cbflood_n_per_burst > 0 &&
+ cbflood_inter_holdoff > 0 &&
+ cbflood_intra_holdoff > 0 &&
+ cur_ops->call &&
+ cur_ops->cb_barrier) {
+ rhp = vmalloc(sizeof(*rhp) *
+ cbflood_n_burst * cbflood_n_per_burst);
+ err = !rhp;
+ }
+ if (err) {
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
+ while (!torture_must_stop())
+ schedule_timeout_interruptible(HZ);
+ return 0;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
+ do {
+ schedule_timeout_interruptible(cbflood_inter_holdoff);
+ atomic_long_inc(&n_cbfloods);
+ WARN_ON(signal_pending(current));
+ for (i = 0; i < cbflood_n_burst; i++) {
+ for (j = 0; j < cbflood_n_per_burst; j++) {
+ cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
+ rcu_torture_cbflood_cb);
+ }
+ schedule_timeout_interruptible(cbflood_intra_holdoff);
+ WARN_ON(signal_pending(current));
+ }
+ cur_ops->cb_barrier();
+ stutter_wait("rcu_torture_cbflood");
+ } while (!torture_must_stop());
+ vfree(rhp);
+ torture_kthread_stopping("rcu_torture_cbflood");
+ return 0;
+}
+
/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
@@ -1019,7 +1129,7 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
@@ -1031,10 +1141,15 @@ rcu_torture_reader(void *arg)
}
/*
- * Create an RCU-torture statistics message in the specified buffer.
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
*/
static void
-rcu_torture_printk(char *page)
+rcu_torture_stats_print(void)
{
int cpu;
int i;
@@ -1052,55 +1167,61 @@ rcu_torture_printk(char *page)
if (pipesummary[i] != 0)
break;
}
- page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page,
- "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
- rcu_torture_current,
- rcu_torture_current_version,
- list_empty(&rcu_torture_freelist),
- atomic_read(&n_rcu_torture_alloc),
- atomic_read(&n_rcu_torture_alloc_fail),
- atomic_read(&n_rcu_torture_free));
- page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
- atomic_read(&n_rcu_torture_mberror),
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
- page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
- n_rcu_torture_boost_failure,
- n_rcu_torture_boosts,
- n_rcu_torture_timers);
- page = torture_onoff_stats(page);
- page += sprintf(page, "barrier: %ld/%ld:%ld",
- n_barrier_successes,
- n_barrier_attempts,
- n_rcu_torture_barrier_error);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free));
+ pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror);
+ pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
+ n_rcu_torture_timers);
+ torture_onoff_stats();
+ pr_cont("barrier: %ld/%ld:%ld ",
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
n_rcu_torture_boost_failure != 0 ||
i > 1) {
- page += sprintf(page, "!!! ");
+ pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
}
- page += sprintf(page, "Reader Pipe: ");
+ pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- page += sprintf(page, " %ld", pipesummary[i]);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page, "Reader Batch: ");
+ pr_cont(" %ld", pipesummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Reader Batch: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- page += sprintf(page, " %ld", batchsummary[i]);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page, "Free-Block Circulation: ");
+ pr_cont(" %ld", batchsummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Free-Block Circulation: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- page += sprintf(page, " %d",
- atomic_read(&rcu_torture_wcount[i]));
+ pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
}
- page += sprintf(page, "\n");
+ pr_cont("\n");
+
if (cur_ops->stats)
- cur_ops->stats(page);
+ cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
rcu_torture_current != NULL) {
int __maybe_unused flags;
@@ -1109,10 +1230,9 @@ rcu_torture_printk(char *page)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- page += sprintf(page,
- "??? Writer stall state %d g%lu c%lu f%#x\n",
- rcu_torture_writer_state,
- gpnum, completed, flags);
+ pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
show_rcu_gp_kthreads();
rcutorture_trace_dump();
}
@@ -1120,30 +1240,6 @@ rcu_torture_printk(char *page)
}
/*
- * Print torture statistics. Caller must ensure that there is only
- * one call to this function at a given time!!! This is normally
- * accomplished by relying on the module system to only have one copy
- * of the module loaded, and then by giving the rcu_torture_stats
- * kthread full control (or the init/cleanup functions when rcu_torture_stats
- * thread is not running).
- */
-static void
-rcu_torture_stats_print(void)
-{
- int size = nr_cpu_ids * 200 + 8192;
- char *buf;
-
- buf = kmalloc(size, GFP_KERNEL);
- if (!buf) {
- pr_err("rcu-torture: Out of memory, need: %d", size);
- return;
- }
- rcu_torture_printk(buf);
- pr_alert("%s", buf);
- kfree(buf);
-}
-
-/*
* Periodically prints torture statistics, if periodic statistics printing
* was specified via the stat_interval module parameter.
*/
@@ -1295,7 +1391,8 @@ static int rcu_torture_barrier_cbs(void *arg)
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
- cur_ops->cb_barrier();
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
destroy_rcu_head_on_stack(&rcu);
torture_kthread_stopping("rcu_torture_barrier_cbs");
return 0;
@@ -1418,7 +1515,7 @@ rcu_torture_cleanup(void)
int i;
rcutorture_record_test_transition();
- if (torture_cleanup()) {
+ if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
return;
@@ -1447,6 +1544,8 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
+ for (i = 0; i < ncbflooders; i++)
+ torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
unregister_cpu_notifier(&rcutorture_cpu_nb);
@@ -1468,6 +1567,7 @@ rcu_torture_cleanup(void)
"End of test: RCU_HOTPLUG");
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
+ torture_cleanup_end();
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -1534,9 +1634,10 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
+ RCUTORTURE_TASKS_OPS
};
- if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -1693,6 +1794,24 @@ rcu_torture_init(void)
goto unwind;
if (object_debug)
rcu_test_debug_objects();
+ if (cbflood_n_burst > 0) {
+ /* Create the cbflood threads */
+ ncbflooders = (num_online_cpus() + 3) / 4;
+ cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
+ GFP_KERNEL);
+ if (!cbflood_task) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < ncbflooders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_cbflood,
+ NULL,
+ cbflood_task[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ }
rcutorture_record_test_transition();
torture_init_end();
return 0;
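
The rcutorture statistics rework above drops the "page += sprintf(page, ...)" accumulation into a kmalloc()ed buffer (whose size had to be guessed as nr_cpu_ids * 200 + 8192) in favour of emitting each fragment directly with pr_alert()/pr_cont(). A tiny userspace sketch of the before and after, with printf() standing in for the printk helpers and made-up counter values:

/* Userspace sketch of the rcu_torture_stats_print() conversion above:
 * buffer accumulation versus direct incremental printing.  Values are
 * illustrative.
 */
#include <stdio.h>

#define NPIPE 10

int main(void)
{
	long pipesummary[NPIPE] = { 5, 4, 3, 2, 1 };
	char buf[256], *page = buf;

	/* Old style: needs a buffer sized for the worst case. */
	page += sprintf(page, "Reader Pipe:");
	for (int i = 0; i < NPIPE; i++)
		page += sprintf(page, " %ld", pipesummary[i]);
	puts(buf);

	/* New style: no sizing problem, each fragment goes straight out. */
	printf("Reader Pipe:");
	for (int i = 0; i < NPIPE; i++)
		printf(" %ld", pipesummary[i]);
	printf("\n");
	return 0;
}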
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
#include "tiny_plugin.h"
-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
+/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
static void rcu_idle_enter_common(long long newval)
{
if (newval) {
@@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval)
}
RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
rcu_dynticks_nesting, newval));
- if (!is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
@@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval)
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
}
- rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+ rcu_sched_qs(); /* implies rcu_bh_inc() */
barrier();
rcu_dynticks_nesting = newval;
}
@@ -114,7 +114,7 @@ void rcu_irq_exit(void)
}
EXPORT_SYMBOL_GPL(rcu_irq_exit);
-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
+/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
static void rcu_idle_exit_common(long long oldval)
{
if (oldval) {
@@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval)
return;
}
RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
- if (!is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
@@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
*/
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
{
unsigned long flags;
@@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu)
/*
* Record an rcu_bh quiescent state.
*/
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
{
unsigned long flags;
@@ -247,13 +247,15 @@ void rcu_bh_qs(int cpu)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
if (user || rcu_is_cpu_rrupt_from_idle())
- rcu_sched_qs(cpu);
+ rcu_sched_qs();
else if (!in_softirq())
- rcu_bh_qs(cpu);
+ rcu_bh_qs();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
}
/*
@@ -378,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_init(void)
+void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+ rcu_early_boot_tests();
}
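
The tiny.c hunks above gate the "not idle task" diagnostics on IS_ENABLED(CONFIG_RCU_TRACE), so the branch is still compiled and type-checked but is dropped by the optimizer when tracing is configured out. A userspace sketch of that style of compile-time conditional; the CONFIG_TRACE_ENABLED macro here is a simplified stand-in, not the kernel's <linux/kconfig.h> machinery:

/* Userspace sketch of the IS_ENABLED() pattern used above: the condition
 * is an ordinary C expression, so the compiler still sees the whole
 * branch but can remove it entirely when the option is off.
 */
#include <stdio.h>

#define CONFIG_TRACE_ENABLED 0			/* flip to 1 to enable the check */

static int is_idle_task(void)
{
	return 0;				/* pretend we are not the idle task */
}

static void trace_entry_error(void)
{
	printf("Entry error: not idle task\n");
}

static void rcu_idle_enter_common(void)
{
	/*
	 * With CONFIG_TRACE_ENABLED == 0 this branch is dead code and is
	 * optimized away, yet it must still compile cleanly.
	 */
	if (CONFIG_TRACE_ENABLED && !is_idle_task())
		trace_entry_error();

	printf("entering idle\n");
}

int main(void)
{
	rcu_idle_enter_common();
	return 0;
}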
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
* the tracing userspace tools to be able to decipher the string
* address to the matching string.
*/
-#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+#ifdef CONFIG_TRACING
+# define DEFINE_RCU_TPS(sname) \
static char sname##_varname[] = #sname; \
-static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
+static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
+# define RCU_STATE_NAME(sname) sname##_varname
+#else
+# define DEFINE_RCU_TPS(sname)
+# define RCU_STATE_NAME(sname) __stringify(sname)
+#endif
+
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+DEFINE_RCU_TPS(sname) \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
@@ -93,10 +102,10 @@ struct rcu_state sname##_state = { \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
- .name = sname##_varname, \
+ .name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}; \
-DEFINE_PER_CPU(struct rcu_data, sname##_data)
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -143,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -188,22 +184,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
* one since the start of the grace period, this just sets a flag.
* The caller must have disabled preemption.
*/
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
+ if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+ }
}
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
+ if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_bh"),
+ __this_cpu_read(rcu_bh_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+ }
}
static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
@@ -275,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled preemption.
*/
-void rcu_note_context_switch(int cpu)
+void rcu_note_context_switch(void)
{
trace_rcu_utilization(TPS("Start context switch"));
- rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
+ rcu_sched_qs();
+ rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
@@ -314,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
-static int rcu_pending(int cpu);
+static int rcu_pending(void);
/*
* Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -499,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
* we really have entered idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
- bool user)
+static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
@@ -520,12 +518,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
- rcu_prepare_for_idle(smp_processor_id());
+ rcu_prepare_for_idle();
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_task_enter();
/*
* It is illegal to enter an extended quiescent state while
@@ -553,7 +552,7 @@ static void rcu_eqs_enter(bool user)
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(rdtp, oldval, user);
+ rcu_eqs_enter_common(oldval, user);
} else {
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
@@ -577,7 +576,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -627,8 +626,8 @@ void rcu_irq_exit(void)
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_enter_common(rdtp, oldval, true);
- rcu_sysidle_enter(rdtp, 1);
+ rcu_eqs_enter_common(oldval, true);
+ rcu_sysidle_enter(1);
local_irq_restore(flags);
}
@@ -639,15 +638,17 @@ void rcu_irq_exit(void)
* we really have exited idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
- int user)
+static void rcu_eqs_exit_common(long long oldval, int user)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ rcu_dynticks_task_exit();
smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle(smp_processor_id());
+ rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
@@ -678,7 +679,7 @@ static void rcu_eqs_exit(bool user)
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(rdtp, oldval, user);
+ rcu_eqs_exit_common(oldval, user);
}
}
@@ -699,7 +700,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -750,8 +751,8 @@ void rcu_irq_enter(void)
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_exit_common(rdtp, oldval, true);
- rcu_sysidle_exit(rdtp, 1);
+ rcu_eqs_exit_common(oldval, true);
+ rcu_sysidle_exit(1);
local_irq_restore(flags);
}
@@ -819,7 +820,7 @@ bool notrace __rcu_is_watching(void)
*/
bool notrace rcu_is_watching(void)
{
- int ret;
+ bool ret;
preempt_disable();
ret = __rcu_is_watching();
@@ -1647,7 +1648,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
- cond_resched();
+ cond_resched_rcu_qs();
}
mutex_unlock(&rsp->onoff_mutex);
@@ -1668,7 +1669,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
if (is_sysidle_rcu_state(rsp)) {
- isidle = 1;
+ isidle = true;
maxj = jiffies - ULONG_MAX / 4;
}
force_qs_rnp(rsp, dyntick_save_progress_counter,
@@ -1677,14 +1678,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = 0;
+ isidle = false;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1736,7 +1738,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
- cond_resched();
+ cond_resched_rcu_qs();
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1785,8 +1787,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched();
- flush_signals(current);
+ cond_resched_rcu_qs();
+ WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("reqwaitsig"));
@@ -1828,11 +1830,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched();
+ cond_resched_rcu_qs();
} else {
/* Deal with stray signal. */
- cond_resched();
- flush_signals(current);
+ cond_resched_rcu_qs();
+ WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
@@ -1928,7 +1930,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -2210,8 +2212,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
-
/* Exclude any attempts to start a new grace period. */
mutex_lock(&rsp->onoff_mutex);
raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
@@ -2375,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
@@ -2393,8 +2393,8 @@ void rcu_check_callbacks(int cpu, int user)
* at least not while the corresponding CPU is online.
*/
- rcu_sched_qs(cpu);
- rcu_bh_qs(cpu);
+ rcu_sched_qs();
+ rcu_bh_qs();
} else if (!in_softirq()) {
@@ -2405,11 +2405,13 @@ void rcu_check_callbacks(int cpu, int user)
* critical section, so note it.
*/
- rcu_bh_qs(cpu);
+ rcu_bh_qs();
}
- rcu_preempt_check_callbacks(cpu);
- if (rcu_pending(cpu))
+ rcu_preempt_check_callbacks();
+ if (rcu_pending())
invoke_rcu_core();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2432,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched();
+ cond_resched_rcu_qs();
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -2449,7 +2451,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
if ((rnp->qsmaskinit & bit) != 0)
- *isidle = 0;
+ *isidle = false;
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
@@ -2505,9 +2507,10 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -2925,11 +2928,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock.
- *
* This implementation can be thought of as an application of ticket
* locking to RCU, with sync_sched_expedited_started and
* sync_sched_expedited_done taking on the roles of the halves
@@ -2953,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
+ cpumask_var_t cm;
+ bool cma = false;
+ int cpu;
long firstsnap, s, snap;
int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
@@ -2979,14 +2980,34 @@ void synchronize_sched_expedited(void)
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
firstsnap = snap;
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ return;
+ }
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
+ if (cma) {
+ cpumask_copy(cm, cpu_online_mask);
+ cpumask_clear_cpu(raw_smp_processor_id(), cm);
+ for_each_cpu(cpu, cm) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ cpumask_clear_cpu(cpu, cm);
+ }
+ if (cpumask_weight(cm) == 0)
+ goto all_cpus_idle;
+ }
+
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cpu_online_mask,
+ while (try_stop_cpus(cma ? cm : cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
@@ -2998,6 +3019,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
+ free_cpumask_var(cm);
return;
}
@@ -3007,6 +3029,7 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
@@ -3016,6 +3039,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
+ free_cpumask_var(cm);
return;
}
@@ -3026,12 +3050,21 @@ void synchronize_sched_expedited(void)
* and they started after our first try, so their grace
* period works for us.
*/
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, use normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
+ return;
+ }
snap = atomic_long_read(&rsp->expedited_start);
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
+all_cpus_idle:
+ free_cpumask_var(cm);
+
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3123,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
-static int rcu_pending(int cpu)
+static int rcu_pending(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
return 1;
return 0;
}
@@ -3138,7 +3171,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3146,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!rdp->nxtlist)
continue;
hc = true;
@@ -3279,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
- atomic_inc(&rsp->barrier_cpu_count);
- __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
- rsp, cpu, 0);
+ if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
+ _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ rsp->n_barrier_done);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ __call_rcu(&rdp->barrier_head,
+ rcu_barrier_callback, rsp, cpu, 0);
+ }
} else if (ACCESS_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
@@ -3442,6 +3480,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
case CPU_UP_PREPARE_FROZEN:
rcu_prepare_cpu(cpu);
rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
@@ -3459,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp)
+ for_each_rcu_flavor(rsp) {
rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
break;
default:
break;
@@ -3489,7 +3530,7 @@ static int rcu_pm_notify(struct notifier_block *self,
}
/*
- * Spawn the kthread that handles this RCU flavor's grace periods.
+ * Spawn the kthreads that handle each RCU flavor's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
@@ -3498,6 +3539,7 @@ static int __init rcu_spawn_gp_kthread(void)
struct rcu_state *rsp;
struct task_struct *t;
+ rcu_scheduler_fully_active = 1;
for_each_rcu_flavor(rsp) {
t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
@@ -3505,8 +3547,9 @@ static int __init rcu_spawn_gp_kthread(void)
raw_spin_lock_irqsave(&rnp->lock, flags);
rsp->gp_kthread = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rcu_spawn_nocb_kthreads(rsp);
}
+ rcu_spawn_nocb_kthreads();
+ rcu_spawn_boost_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -3738,6 +3781,8 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+
+ rcu_early_boot_tests();
}
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6a86eb7bac45..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
- /* complete (only for TREE_PREEMPT_RCU). */
+ /* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -350,7 +350,7 @@ struct rcu_data {
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
- bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -383,6 +383,11 @@ struct rcu_data {
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
+/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOGP_WAKE_NOT 0
+#define RCU_NOGP_WAKE 1
+#define RCU_NOGP_WAKE_FORCE 2
+
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
/* and jiffies_till_next_fqs. */
@@ -525,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -542,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
+static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -556,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
+#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -572,15 +577,17 @@ static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(int cpu);
-static void rcu_prepare_for_idle(int cpu);
+static void rcu_cleanup_after_idle(void);
+static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -589,14 +596,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags);
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
+static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void __init rcu_spawn_nocb_kthreads(void);
+#ifdef CONFIG_RCU_NOCB_CPU
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_enter(int irq);
+static void rcu_sysidle_exit(int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
@@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
static void rcu_bind_gp_kthread(void);
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
+static void rcu_dynticks_task_enter(void);
+static void rcu_dynticks_task_exit(void);
#endif /* #ifndef RCU_TREE_NONCORE */
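
The tree.h hunk above turns nocb_defer_wakeup from a bool into an int so a deferred wakeup can also record how forceful it needs to be (the three RCU_NOGP_WAKE_* values). A minimal sketch of that idea; the upgrade-only rule and the helper names here are illustrative assumptions, not lifted from tree_plugin.h:

/* Userspace sketch of a tri-state deferred wakeup, as suggested by the
 * RCU_NOGP_WAKE_* values above: later requests may strengthen a pending
 * wakeup, and the flush acts once on the strongest level recorded.
 * Names and the upgrade rule are illustrative.
 */
#include <stdio.h>

enum { WAKE_NOT, WAKE, WAKE_FORCE };

static int defer_state = WAKE_NOT;

static void defer_wakeup(int waketype)
{
	if (defer_state < waketype)		/* only upgrade, never downgrade */
		defer_state = waketype;
}

static void do_deferred_wakeup(void)
{
	if (defer_state == WAKE_NOT)
		return;
	printf("waking leader (%s)\n",
	       defer_state == WAKE_FORCE ? "forced" : "normal");
	defer_state = WAKE_NOT;
}

int main(void)
{
	defer_wakeup(WAKE);
	defer_wakeup(WAKE_FORCE);		/* strengthens the pending wakeup */
	do_deferred_wakeup();			/* prints "waking leader (forced)" */
	return 0;
}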
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a7997e272564..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
#include <linux/smpboot.h>
#include "../time/tick-internal.h"
-#define RCU_KTHREAD_PRIO 1
-
#ifdef CONFIG_RCU_BOOST
+
#include "../locking/rtmutex_common.h"
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else
-#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
-#endif
+
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
pr_info("\tRCU torture testing starts during boot.\n");
#endif
-#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
- pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
-#endif
#if defined(CONFIG_RCU_CPU_STALL_INFO)
pr_info("\tAdditional per-CPU info printed with stalls.\n");
#endif
@@ -85,36 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-#ifdef CONFIG_RCU_NOCB_CPU
-#ifndef CONFIG_RCU_NOCB_CPU_NONE
- if (!have_rcu_nocb_mask) {
- zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
- have_rcu_nocb_mask = true;
- }
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
- if (have_rcu_nocb_mask) {
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
- pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
- }
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#endif
}
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -134,7 +117,7 @@ static void __init rcu_bootup_announce(void)
* Return the number of RCU-preempt batches processed thus far
* for debug and statistics.
*/
-long rcu_batches_completed_preempt(void)
+static long rcu_batches_completed_preempt(void)
{
return rcu_preempt_state.completed;
}
@@ -155,18 +138,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
*
- * Unlike the other rcu_*_qs() functions, callers to this function
- * must disable irqs in order to protect the assignment to
- * ->rcu_read_unlock_special.
- */
-static void rcu_preempt_qs(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
- current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ * As with the other rcu_*_qs() functions, callers to this function
+ * must disable preemption.
+ */
+static void rcu_preempt_qs(void)
+{
+ if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_preempt"),
+ __this_cpu_read(rcu_preempt_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
+ current->rcu_read_unlock_special.b.need_qs = false;
+ }
}
/*
@@ -182,7 +166,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -190,14 +174,14 @@ static void rcu_preempt_note_context_switch(int cpu)
struct rcu_node *rnp;
if (t->rcu_read_lock_nesting > 0 &&
- (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+ !t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
/*
@@ -239,7 +223,7 @@ static void rcu_preempt_note_context_switch(int cpu)
: rnp->gpnum + 1);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special) {
+ t->rcu_read_unlock_special.s) {
/*
* Complete exit from RCU read-side critical section on
@@ -257,9 +241,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
- local_irq_save(flags);
- rcu_preempt_qs(cpu);
- local_irq_restore(flags);
+ rcu_preempt_qs();
}
/*
@@ -340,7 +322,7 @@ void rcu_read_unlock_special(struct task_struct *t)
bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
- int special;
+ union rcu_special special;
/* NMI handlers cannot block and cannot safely manipulate state. */
if (in_nmi())
@@ -350,12 +332,13 @@ void rcu_read_unlock_special(struct task_struct *t)
/*
* If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so.
+ * let it know that we have done so. Because irqs are disabled,
+ * t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- if (special & RCU_READ_UNLOCK_NEED_QS) {
- rcu_preempt_qs(smp_processor_id());
- if (!t->rcu_read_unlock_special) {
+ if (special.b.need_qs) {
+ rcu_preempt_qs();
+ if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
}
@@ -368,8 +351,8 @@ void rcu_read_unlock_special(struct task_struct *t)
}
/* Clean up if blocked during RCU read-side critical section. */
- if (special & RCU_READ_UNLOCK_BLOCKED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+ if (special.b.blocked) {
+ t->rcu_read_unlock_special.b.blocked = false;
/*
* Remove this task from the list it blocked on. The
@@ -442,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
}
}
-#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
-
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period on the specified rcu_node structure.
@@ -478,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
#ifdef CONFIG_RCU_CPU_STALL_INFO
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -648,17 +621,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
*
* Caller must disable hard irqs.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
struct task_struct *t = current;
if (t->rcu_read_lock_nesting == 0) {
- rcu_preempt_qs(cpu);
+ rcu_preempt_qs();
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- per_cpu(rcu_preempt_data, cpu).qs_pending)
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+ __this_cpu_read(rcu_preempt_data.qs_pending) &&
+ !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ t->rcu_read_unlock_special.b.need_qs = true;
}
#ifdef CONFIG_RCU_BOOST
@@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
* In fact, if you are using synchronize_rcu_expedited() in a loop,
* please restructure your code to batch your updates, and then use a
* single synchronize_rcu() instead.
- *
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock.
*/
void synchronize_rcu_expedited(void)
{
@@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void)
* being boosted. This simplifies the process of moving tasks
* from leaf to root rcu_node structures.
*/
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU-hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu);
+ return;
+ }
/*
* Acquire lock, falling back to synchronize_rcu() if too many
@@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void)
/* Clean up and exit. */
smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
+ ACCESS_ONCE(sync_rcu_preempt_exp_count) =
+ sync_rcu_preempt_exp_count + 1;
unlock_mb_ret:
mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
@@ -941,11 +915,11 @@ void exit_rcu(void)
return;
t->rcu_read_lock_nesting = 1;
barrier();
- t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_read_unlock_special.b.blocked = true;
__rcu_read_unlock();
}
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *rcu_state_p = &rcu_sched_state;
@@ -971,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
}
@@ -1043,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
}
@@ -1096,7 +1070,7 @@ void exit_rcu(void)
{
}
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
@@ -1352,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
smp_mb__after_unlock_lock();
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_BOOST_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
@@ -1369,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
@@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = {
};
/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
+ * Spawn boost kthreads -- called as soon as the scheduler is running.
*/
-static int __init rcu_spawn_kthreads(void)
+static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
int cpu;
- rcu_scheduler_fully_active = 1;
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
@@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void)
rcu_for_each_leaf_node(rcu_state_p, rnp)
(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
- return 0;
}
-early_initcall(rcu_spawn_kthreads);
static void rcu_prepare_kthreads(int cpu)
{
@@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static int __init rcu_scheduler_really_started(void)
+static void __init rcu_spawn_boost_kthreads(void)
{
- rcu_scheduler_fully_active = 1;
- return 0;
}
-early_initcall(rcu_scheduler_really_started);
static void rcu_prepare_kthreads(int cpu)
{
@@ -1544,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
* any flavor of RCU.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu, NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1555,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
* after it.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
}
@@ -1563,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
* Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
* is nothing.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
}
@@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
/* Exit early if we advanced recently. */
if (jiffies == rdtp->last_advance_all)
- return 0;
+ return false;
rdtp->last_advance_all = jiffies;
for_each_rcu_flavor(rsp) {
@@ -1656,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* The caller must have disabled interrupts.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *dj)
+int rcu_needs_cpu(unsigned long *dj)
{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
*dj = ULONG_MAX;
return 0;
}
@@ -1698,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*
* The caller must have disabled interrupts.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
@@ -1711,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu, NULL))
+ if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1720,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/*
@@ -1744,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
@@ -1763,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self,
get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched();
+ cond_resched_rcu_qs();
}
put_online_cpus();
@@ -2075,13 +2043,40 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
return;
if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
- /* Prior xchg orders against prior callback enqueue. */
+ /* Prior smp_mb__after_atomic() orders against prior enqueue. */
ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
wake_up(&rdp_leader->nocb_wq);
}
}
/*
+ * Does the specified CPU need an RCU callback for the specified flavor
+ * of rcu_barrier()?
+ */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_head *rhp;
+
+ /* No-CBs CPUs might have callbacks on any of three lists. */
+ rhp = ACCESS_ONCE(rdp->nocb_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+
+ /* Having no rcuo kthread but CBs after scheduler starts is bad! */
+ if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
+ /* RCU callback enqueued before CPU first came online??? */
+ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
+ cpu, rhp->func);
+ WARN_ON_ONCE(1);
+ }
+
+ return !!rhp;
+}
+
+/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
ACCESS_ONCE(*old_rhpp) = rhp;
atomic_long_add(rhcount, &rdp->nocb_q_count);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
+ smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
t = ACCESS_ONCE(rdp->nocb_kthread);
@@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rdp->nocb_defer_wakeup = true;
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
- wake_nocb_leader(rdp, true);
+ if (!irqs_disabled_flags(flags)) {
+ wake_nocb_leader(rdp, true);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvf"));
+ } else {
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvfIsDeferred"));
+ }
rdp->qlen_last_fqs_check = LONG_MAX / 2;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
} else {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
}
@@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
{
if (!rcu_is_nocb_cpu(rdp->cpu))
- return 0;
+ return false;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
@@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_callback(rdp->rsp->name, rhp,
-atomic_long_read(&rdp->nocb_q_count_lazy),
-atomic_long_read(&rdp->nocb_q_count));
- return 1;
+
+ /*
+ * If called from an extended quiescent state with interrupts
+ * disabled, invoke the RCU core in order to allow the idle-entry
+ * deferred-wakeup check to function.
+ */
+ if (irqs_disabled_flags(flags) &&
+ !rcu_is_watching() &&
+ cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ return true;
}
/*
@@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return 0;
+ return false;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
@@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
rsp->orphan_nxtlist = NULL;
rsp->orphan_nxttail = &rsp->orphan_nxtlist;
}
- return 1;
+ return true;
}
/*
@@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
if (likely(d))
break;
- flush_signals(current);
+ WARN_ON(signal_pending(current));
trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
}
trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
@@ -2288,7 +2302,7 @@ wait_again:
if (!rcu_nocb_poll)
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
"WokeEmpty");
- flush_signals(current);
+ WARN_ON(signal_pending(current));
schedule_timeout_interruptible(1);
/* Rescan in case we were a victim of memory ordering. */
@@ -2327,6 +2341,7 @@ wait_again:
atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
atomic_long_add(rdp->nocb_gp_count_lazy,
&rdp->nocb_follower_count_lazy);
+ smp_mb__after_atomic(); /* Store *tail before wakeup. */
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/*
* List was empty, wake up the follower.
@@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"WokeEmpty");
- flush_signals(current);
+ WARN_ON(signal_pending(current));
schedule_timeout_interruptible(1);
}
}
@@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg)
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
- ACCESS_ONCE(rdp->nocb_p_count) -= c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
+ ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
+ ACCESS_ONCE(rdp->nocb_p_count_lazy) =
+ rdp->nocb_p_count_lazy - cl;
rdp->n_nocbs_invoked += c;
}
return 0;
}
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return ACCESS_ONCE(rdp->nocb_defer_wakeup);
}
@@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
/* Do a deferred wakeup of rcu_nocb_kthread(). */
static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ int ndw;
+
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_nocb_leader(rdp, false);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+ ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
+}
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = true;
+ struct rcu_state *rsp;
+
+#ifdef CONFIG_RCU_NOCB_CPU_NONE
+ need_rcu_nocb_mask = false;
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_NONE */
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ have_rcu_nocb_mask = true;
+ }
+ if (!have_rcu_nocb_mask)
+ return;
+
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tOffload RCU callbacks from CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tOffload RCU callbacks from all CPUs\n");
+ cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
+ pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_rcu_flavor(rsp) {
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ /*
+ * If there are early callbacks, they will need
+ * to be moved to the nocb lists.
+ */
+ WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
+ &rdp->nxtlist &&
+ rdp->nxttail[RCU_NEXT_TAIL] != NULL);
+ init_nocb_callback_list(rdp);
+ }
+ rcu_organize_nocb_kthreads(rsp);
+ }
}
/* Initialize per-rcu_data variables for no-CBs CPUs. */
@@ -2459,15 +2543,89 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
+ * brought online out of order, this can require re-organizing the
+ * leader-follower relationships.
+ */
+static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_last;
+ struct rcu_data *rdp_old_leader;
+ struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
+ struct task_struct *t;
+
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
+ return;
+
+ /* If we didn't spawn the leader first, reorganize! */
+ rdp_old_leader = rdp_spawn->nocb_leader;
+ if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
+ rdp_last = NULL;
+ rdp = rdp_old_leader;
+ do {
+ rdp->nocb_leader = rdp_spawn;
+ if (rdp_last && rdp != rdp_spawn)
+ rdp_last->nocb_next_follower = rdp;
+ if (rdp == rdp_spawn) {
+ rdp = rdp->nocb_next_follower;
+ } else {
+ rdp_last = rdp;
+ rdp = rdp->nocb_next_follower;
+ rdp_last->nocb_next_follower = NULL;
+ }
+ } while (rdp);
+ rdp_spawn->nocb_next_follower = rdp_old_leader;
+ }
+
+ /* Spawn the kthread for this CPU and RCU flavor. */
+ t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ "rcuo%c/%d", rsp->abbr, cpu);
+ BUG_ON(IS_ERR(t));
+ ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthreads, spawn them.
+ */
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+ struct rcu_state *rsp;
+
+ if (rcu_scheduler_fully_active)
+ for_each_rcu_flavor(rsp)
+ rcu_spawn_one_nocb_kthread(rsp, cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_all_nocb_kthreads(cpu);
+}
+
/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
static int rcu_nocb_leader_stride = -1;
module_param(rcu_nocb_leader_stride, int, 0444);
/*
- * Create a kthread for each RCU flavor for each no-CBs CPU.
- * Also initialize leader-follower relationships.
+ * Initialize leader-follower relationships for all no-CBs CPUs.
*/
-static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
{
int cpu;
int ls = rcu_nocb_leader_stride;
@@ -2475,14 +2633,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
- struct task_struct *t;
- if (rcu_nocb_mask == NULL)
+ if (!have_rcu_nocb_mask)
return;
-#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
if (ls == -1) {
ls = int_sqrt(nr_cpu_ids);
rcu_nocb_leader_stride = ls;
@@ -2505,27 +2658,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
rdp_prev->nocb_next_follower = rdp;
}
rdp_prev = rdp;
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_kthread, rdp,
- "rcuo%c/%d", rsp->abbr, cpu);
- BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp->nocb_kthread) = t;
}
}
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
- if (rcu_nocb_mask == NULL ||
- !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
+ if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
+
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
return true;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ WARN_ON_ONCE(1); /* Should be dead code. */
+ return false;
+}
+
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2541,21 +2694,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags)
{
- return 0;
+ return false;
}
static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
- return 0;
+ return false;
}
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return false;
}
@@ -2564,7 +2717,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
-static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
{
}
@@ -2595,16 +2752,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-/*
- * Define RCU flavor that holds sysidle state. This needs to be the
- * most active flavor of RCU.
- */
-#ifdef CONFIG_PREEMPT_RCU
-static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
static int full_sysidle_state; /* Current system-idle state. */
#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
@@ -2618,9 +2765,14 @@ static int full_sysidle_state; /* Current system-idle state. */
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
unsigned long j;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
/* Adjust nesting, check for fully idle. */
if (irq) {
@@ -2685,8 +2837,14 @@ void rcu_sysidle_force_exit(void)
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
/* Adjust nesting, check for already non-idle. */
if (irq) {
rdtp->dynticks_idle_nesting++;
@@ -2741,12 +2899,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long j;
struct rcu_dynticks *rdtp = rdp->dynticks;
+ /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
+ if (!tick_nohz_full_enabled())
+ return;
+
/*
* If some other CPU has already reported non-idle, if this is
* not the flavor of RCU that tracks sysidle state, or if this
* is an offline or the timekeeping CPU, nothing to do.
*/
- if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+ if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
if (rcu_gp_in_progress(rdp->rsp))
@@ -2772,7 +2934,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
*/
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
- return rsp == rcu_sysidle_state;
+ return rsp == rcu_state_p;
}
/*
@@ -2850,7 +3012,7 @@ static void rcu_sysidle_cancel(void)
static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
unsigned long maxj, bool gpkt)
{
- if (rsp != rcu_sysidle_state)
+ if (rsp != rcu_state_p)
return; /* Wrong flavor, ignore. */
if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return; /* Running state machine from timekeeping CPU. */
@@ -2867,6 +3029,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
rcu_sysidle_report(rsp, isidle, maxj, true);
}
@@ -2893,7 +3059,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
/*
* Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts.
+ * The caller must have disabled interrupts. This is not intended to be
+ * called unless tick_nohz_full_enabled().
*/
bool rcu_sys_is_idle(void)
{
@@ -2919,13 +3086,12 @@ bool rcu_sys_is_idle(void)
/* Scan all the CPUs looking for nonidle CPUs. */
for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+ rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
if (!isidle)
break;
}
- rcu_sysidle_report(rcu_sysidle_state,
- isidle, maxj, false);
+ rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
rss = ACCESS_ONCE(full_sysidle_state);
}
@@ -2952,7 +3118,7 @@ bool rcu_sys_is_idle(void)
* provided by the memory allocator.
*/
if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_sysidle_state) &&
+ !rcu_gp_in_progress(rcu_state_p) &&
!rsh.inuse && xchg(&rsh.inuse, 1) == 0)
call_rcu(&rsh.rh, rcu_sysidle_cb);
return false;
@@ -2968,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
}
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
}
@@ -3036,3 +3202,19 @@ static void rcu_bind_gp_kthread(void)
housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
+
+/* Record the current task on dyntick-idle entry. */
+static void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,8 @@
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/tick.h>
#define CREATE_TRACE_POINTS
@@ -91,7 +93,7 @@ void __rcu_read_unlock(void)
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
@@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void)
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
/**
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
+ * read-side critical section. In the absence of CONFIG_DEBUG_LOCK_ALLOC,
+ * this assumes we are in an RCU read-side critical section unless it can
+ * prove otherwise. This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
+ *
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that rcu_read_lock() and the matching rcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock() in process context if the matching rcu_read_lock()
+ * was invoked from within an irq handler.
+ *
+ * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * offline from an RCU perspective, so check for those as well.
+ */
+int rcu_read_lock_held(void)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ return lock_is_held(&rcu_lock_map);
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_held);
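
As an aside, a minimal sketch of how the newly exported rcu_read_lock_held() is typically consumed by debug checks follows; the demo_node structure, demo_ptr pointer, and demo_read_value() helper are invented for illustration and assume <linux/rcupdate.h>. As the kernel-doc above notes, the return value is only meaningful when lockdep is enabled.

/* Illustrative sketch only -- not part of this patch. */
struct demo_node {                      /* hypothetical RCU-protected object */
        int value;
};

static struct demo_node __rcu *demo_ptr;

static int demo_read_value(void)
{
        struct demo_node *p;

        /* Complain (under lockdep) if the caller forgot rcu_read_lock(). */
        WARN_ON_ONCE(!rcu_read_lock_held());
        p = rcu_dereference(demo_ptr);
        return p ? p->value : -1;
}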
+
+/**
* rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
* Check for bottom half being disabled, which covers both the
@@ -272,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -347,3 +381,397 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle. As such, grace periods can take one good
+ * long time. There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like. Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs. If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Track exiting tasks in order to allow them to be waited for. */
+DEFINE_SRCU(tasks_rcu_exit_srcu);
+
+/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+module_param(rcu_task_stall_timeout, int, 0644);
+
+static void rcu_spawn_tasks_kthread(void);
+
+/*
+ * Post an RCU-tasks callback. First call must be from process context
+ * after the scheduler is fully operational.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+ unsigned long flags;
+ bool needwake;
+
+ rhp->next = NULL;
+ rhp->func = func;
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ needwake = !rcu_tasks_cbs_head;
+ *rcu_tasks_cbs_tail = rhp;
+ rcu_tasks_cbs_tail = &rhp->next;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+ if (needwake) {
+ rcu_spawn_tasks_kthread();
+ wake_up(&rcu_tasks_cbs_wq);
+ }
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
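
To make the intended call_rcu_tasks() usage concrete, here is a minimal sketch (assuming CONFIG_TASKS_RCU=y plus <linux/rcupdate.h> and <linux/slab.h>) of deferring the free of a trampoline-like object until no task can still be executing it; the demo_tramp structure and helper names are hypothetical.

/* Illustrative sketch only -- not part of this patch. */
struct demo_tramp {                     /* hypothetical object tasks may execute */
        void *text;
        struct rcu_head rh;
};

static void demo_tramp_free(struct rcu_head *rhp)
{
        struct demo_tramp *tp = container_of(rhp, struct demo_tramp, rh);

        kfree(tp);      /* safe: an RCU-tasks grace period has elapsed */
}

static void demo_tramp_retire(struct demo_tramp *tp)
{
        /* Callback runs once every task has passed a voluntary switch. */
        call_rcu_tasks(&tp->rh, demo_tramp_free);
}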
+
+/**
+ * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have elapsed. These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function
+ * preambles and profiling hooks. The synchronize_rcu_tasks() function
+ * is not (yet) intended for heavy use from multiple CPUs.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-tasks read-side critical section whose beginning
+ * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
+ * having an RCU-tasks read-side critical section that extends beyond
+ * the return from synchronize_rcu_tasks() is guaranteed to have executed
+ * a full memory barrier after the beginning of synchronize_rcu_tasks()
+ * and before the beginning of that RCU-tasks read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
+ * (but again only if the system has more than one CPU).
+ */
+void synchronize_rcu_tasks(void)
+{
+ /* Complain if the scheduler has not started. */
+ rcu_lockdep_assert(!rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
+
+ /* Wait for the grace period. */
+ wait_rcu_gp(call_rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
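
And the synchronous counterpart, again only as a hedged sketch with an invented demo_retire_sync() helper: a caller that may block can simply wait for the RCU-tasks grace period inline instead of posting a callback.

/* Illustrative sketch only -- not part of this patch. */
static void demo_retire_sync(void *obj)
{
        synchronize_rcu_tasks();        /* all tasks have voluntarily switched */
        kfree(obj);                     /* so nothing can still be using obj */
}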
+
+/**
+ * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to do so, for example, if there are no pending callbacks.
+ */
+void rcu_barrier_tasks(void)
+{
+ /* There is only one callback queue, so this is easy. ;-) */
+ synchronize_rcu_tasks();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+
+/* See if tasks are still holding out, complain if so. */
+static void check_holdout_task(struct task_struct *t,
+ bool needreport, bool *firstreport)
+{
+ int cpu;
+
+ if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+ !ACCESS_ONCE(t->on_rq) ||
+ (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+ !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ list_del_init(&t->rcu_tasks_holdout_list);
+ put_task_struct(t);
+ return;
+ }
+ if (!needreport)
+ return;
+ if (*firstreport) {
+ pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
+ *firstreport = false;
+ }
+ cpu = task_cpu(t);
+ pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
+ t, ".I"[is_idle_task(t)],
+ "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
+ t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
+ t->rcu_tasks_idle_cpu, cpu);
+ sched_show_task(t);
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+ unsigned long lastreport;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ LIST_HEAD(rcu_tasks_holdouts);
+
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current);
+
+ /*
+ * Each pass through the following loop makes one check for
+ * newly arrived callbacks, and, if there are some, waits for
+ * one RCU-tasks grace period and then invokes the callbacks.
+ * This loop is terminated by the system going down. ;-)
+ */
+ for (;;) {
+
+ /* Pick up any new callbacks. */
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ list = rcu_tasks_cbs_head;
+ rcu_tasks_cbs_head = NULL;
+ rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+ /* If there were none, wait a bit and start over. */
+ if (!list) {
+ wait_event_interruptible(rcu_tasks_cbs_wq,
+ rcu_tasks_cbs_head);
+ if (!rcu_tasks_cbs_head) {
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(HZ/10);
+ }
+ continue;
+ }
+
+ /*
+ * Wait for all pre-existing t->on_rq and t->nvcsw
+ * transitions to complete. Invoking synchronize_sched()
+ * suffices because all these transitions occur with
+ * interrupts disabled. Without this synchronize_sched(),
+ * a read-side critical section that started before the
+ * grace period might be incorrectly seen as having started
+ * after the grace period.
+ *
+ * This synchronize_sched() also dispenses with the
+ * need for a memory barrier on the first store to
+ * ->rcu_tasks_holdout, as it forces the store to happen
+ * after the beginning of the grace period.
+ */
+ synchronize_sched();
+
+ /*
+ * There were callbacks, so we need to wait for an
+ * RCU-tasks grace period. Start off by scanning
+ * the task list for tasks that are not already
+ * voluntarily blocked. Mark these tasks and make
+ * a list of them in rcu_tasks_holdouts.
+ */
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
+ if (t != current && ACCESS_ONCE(t->on_rq) &&
+ !is_idle_task(t)) {
+ get_task_struct(t);
+ t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+ ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ list_add(&t->rcu_tasks_holdout_list,
+ &rcu_tasks_holdouts);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Wait for tasks that are in the process of exiting.
+ * This does only part of the job, ensuring that all
+ * tasks that were previously exiting reach the point
+ * where they have disabled preemption, allowing the
+ * later synchronize_sched() to finish the job.
+ */
+ synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ /*
+ * Each pass through the following loop scans the list
+ * of holdout tasks, removing any that are no longer
+ * holdouts. When the list is empty, we are done.
+ */
+ lastreport = jiffies;
+ while (!list_empty(&rcu_tasks_holdouts)) {
+ bool firstreport;
+ bool needreport;
+ int rtst;
+ struct task_struct *t1;
+
+ schedule_timeout_interruptible(HZ);
+ rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ needreport = rtst > 0 &&
+ time_after(jiffies, lastreport + rtst);
+ if (needreport)
+ lastreport = jiffies;
+ firstreport = true;
+ WARN_ON(signal_pending(current));
+ list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
+ rcu_tasks_holdout_list) {
+ check_holdout_task(t, needreport, &firstreport);
+ cond_resched();
+ }
+ }
+
+ /*
+ * Because ->on_rq and ->nvcsw are not guaranteed
+ * to have full memory barriers prior to them in the
+ * schedule() path, memory reordering on other CPUs could
+ * cause their RCU-tasks read-side critical sections to
+ * extend past the end of the grace period. However,
+ * because these ->nvcsw updates are carried out with
+ * interrupts disabled, we can use synchronize_sched()
+ * to force the needed ordering on all such CPUs.
+ *
+ * This synchronize_sched() also confines all
+ * ->rcu_tasks_holdout accesses to be within the grace
+ * period, avoiding the need for memory barriers for
+ * ->rcu_tasks_holdout accesses.
+ *
+ * In addition, this synchronize_sched() waits for exiting
+ * tasks to complete their final preempt_disable() region
+ * of execution, cleaning up after the synchronize_srcu()
+ * above.
+ */
+ synchronize_sched();
+
+ /* Invoke the callbacks. */
+ while (list) {
+ next = list->next;
+ local_bh_disable();
+ list->func(list);
+ local_bh_enable();
+ list = next;
+ cond_resched();
+ }
+ schedule_timeout_uninterruptible(HZ/10);
+ }
+}
+
+/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
+static void rcu_spawn_tasks_kthread(void)
+{
+ static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
+ static struct task_struct *rcu_tasks_kthread_ptr;
+ struct task_struct *t;
+
+ if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ smp_mb(); /* Ensure caller sees full kthread. */
+ return;
+ }
+ mutex_lock(&rcu_tasks_kthread_mutex);
+ if (rcu_tasks_kthread_ptr) {
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+ return;
+ }
+ t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+ BUG_ON(IS_ERR(t));
+ smp_mb(); /* Ensure others see full kthread. */
+ ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+}
+
+#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Early boot self test parameters, one for each flavor
+ */
+static bool rcu_self_test;
+static bool rcu_self_test_bh;
+static bool rcu_self_test_sched;
+
+module_param(rcu_self_test, bool, 0444);
+module_param(rcu_self_test_bh, bool, 0444);
+module_param(rcu_self_test_sched, bool, 0444);
+
+static int rcu_self_test_counter;
+
+static void test_callback(struct rcu_head *r)
+{
+ rcu_self_test_counter++;
+ pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
+}
+
+static void early_boot_test_call_rcu(void)
+{
+ static struct rcu_head head;
+
+ call_rcu(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_bh(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_bh(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_sched(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_sched(&head, test_callback);
+}
+
+void rcu_early_boot_tests(void)
+{
+ pr_info("Running RCU self tests\n");
+
+ if (rcu_self_test)
+ early_boot_test_call_rcu();
+ if (rcu_self_test_bh)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
+}
+
+static int rcu_verify_early_boot_tests(void)
+{
+ int ret = 0;
+ int early_boot_test_counter = 0;
+
+ if (rcu_self_test) {
+ early_boot_test_counter++;
+ rcu_barrier();
+ }
+ if (rcu_self_test_bh) {
+ early_boot_test_counter++;
+ rcu_barrier_bh();
+ }
+ if (rcu_self_test_sched) {
+ early_boot_test_counter++;
+ rcu_barrier_sched();
+ }
+
+ if (rcu_self_test_counter != early_boot_test_counter) {
+ WARN_ON(1);
+ ret = -1;
+ }
+
+ return ret;
+}
+late_initcall(rcu_verify_early_boot_tests);
+#else
+void rcu_early_boot_tests(void) {}
+#endif /* CONFIG_PROVE_RCU */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to restart the system.
+ */
+static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
+
+/**
+ * register_restart_handler - Register function to be called to reset
+ * the system
+ * @nb: Info about handler function to be called
+ * @nb->priority: Handler priority. Handlers should follow the
+ * following guidelines for setting priorities.
+ * 0: Restart handler of last resort,
+ * with limited restart capabilities
+ * 128: Default restart handler; use if no other
+ * restart handler is expected to be available,
+ * and/or if restart functionality is
+ * sufficient to restart the entire system
+ * 255: Highest priority restart handler, will
+ * preempt all other restart handlers
+ *
+ * Registers a function with code to be called to restart the
+ * system.
+ *
+ * Registered functions will be called from machine_restart as the last
+ * step of the restart sequence (if the architecture-specific
+ * machine_restart function calls do_kernel_restart - see below
+ * for details).
+ * Registered functions are expected to restart the system immediately.
+ * If more than one function is registered, the restart handler priority
+ * selects which function will be called first.
+ *
+ * Restart handlers are expected to be registered from non-architecture
+ * code, typically from drivers. A typical use case would be a system
+ * where restart functionality is provided through a watchdog. Multiple
+ * restart handlers may exist; for example, one restart handler might
+ * restart the entire system, while another only restarts the CPU.
+ * In such cases, the restart handler which only restarts part of the
+ * hardware is expected to register with low priority to ensure that
+ * it only runs if no other means to restart the system is available.
+ *
+ * Currently always returns zero, as atomic_notifier_chain_register()
+ * always returns zero.
+ */
+int register_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_restart_handler);
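
As a hedged sketch of the driver-side pattern described above (assuming <linux/notifier.h> and <linux/reboot.h>), a hypothetical watchdog driver might register a default-priority handler like this; the wdt_* names are invented, and a real handler would trigger its hardware reset before returning.

/* Illustrative sketch only -- not part of this patch. */
static int wdt_restart_handle(struct notifier_block *nb,
                              unsigned long mode, void *cmd)
{
        /* A real driver would kick its watchdog into reset mode here. */
        return NOTIFY_DONE;
}

static struct notifier_block wdt_restart_nb = {
        .notifier_call = wdt_restart_handle,
        .priority = 128,        /* default priority per the guidelines above */
};

static int wdt_demo_probe(void)
{
        return register_restart_handler(&wdt_restart_nb);
}

static void wdt_demo_remove(void)
{
        unregister_restart_handler(&wdt_restart_nb);
}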
+
+/**
+ * unregister_restart_handler - Unregister previously registered
+ * restart handler
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered restart handler function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_restart_handler);
+
+/**
+ * do_kernel_restart - Execute kernel restart handler call chain
+ *
+ * Calls functions registered with register_restart_handler.
+ *
+ * Expected to be called from machine_restart as the last step of the restart
+ * sequence.
+ *
+ * Restarts the system immediately if a restart handler function has been
+ * registered. Otherwise does nothing.
+ */
+void do_kernel_restart(char *cmd)
+{
+ atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
+}
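
For the architecture side, a hedged sketch of how a machine_restart() implementation might invoke the new chain as its last step; the local_irq_disable()/smp_send_stop() preamble and the busy-wait fallback are assumptions borrowed from common arch code, not requirements of this patch.

/* Illustrative sketch only -- not part of this patch. */
void machine_restart(char *cmd)
{
        local_irq_disable();
        smp_send_stop();                /* park the other CPUs first */
        do_kernel_restart(cmd);         /* give registered handlers a chance */

        /* If no handler restarted the machine, spin as a last resort. */
        while (1)
                cpu_relax();
}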
+
void migrate_to_reboot_cpu(void)
{
/* The boot cpu is always logical cpu 0 */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * resource cgroups
- *
- * Copyright 2007 OpenVZ SWsoft Inc
- *
- * Author: Pavel Emelianov <xemul@openvz.org>
- *
- */
-
-#include <linux/types.h>
-#include <linux/parser.h>
-#include <linux/fs.h>
-#include <linux/res_counter.h>
-#include <linux/uaccess.h>
-#include <linux/mm.h>
-
-void res_counter_init(struct res_counter *counter, struct res_counter *parent)
-{
- spin_lock_init(&counter->lock);
- counter->limit = RES_COUNTER_MAX;
- counter->soft_limit = RES_COUNTER_MAX;
- counter->parent = parent;
-}
-
-static u64 res_counter_uncharge_locked(struct res_counter *counter,
- unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
-static int res_counter_charge_locked(struct res_counter *counter,
- unsigned long val, bool force)
-{
- int ret = 0;
-
- if (counter->usage + val > counter->limit) {
- counter->failcnt++;
- ret = -ENOMEM;
- if (!force)
- return ret;
- }
-
- counter->usage += val;
- if (counter->usage > counter->max_usage)
- counter->max_usage = counter->usage;
- return ret;
-}
-
-static int __res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at, bool force)
-{
- int ret, r;
- unsigned long flags;
- struct res_counter *c, *u;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val, force);
- spin_unlock(&c->lock);
- if (r < 0 && !ret) {
- ret = r;
- *limit_fail_at = c;
- if (!force)
- break;
- }
- }
-
- if (ret < 0 && !force) {
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, false);
-}
-
-int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, true);
-}
-
-u64 res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
-{
- unsigned long flags;
- struct res_counter *c;
- u64 ret = 0;
-
- local_irq_save(flags);
- for (c = counter; c != top; c = c->parent) {
- u64 r;
- spin_lock(&c->lock);
- r = res_counter_uncharge_locked(c, val);
- if (c == counter)
- ret = r;
- spin_unlock(&c->lock);
- }
- local_irq_restore(flags);
- return ret;
-}
-
-u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
-{
- return res_counter_uncharge_until(counter, NULL, val);
-}
-
-static inline unsigned long long *
-res_counter_member(struct res_counter *counter, int member)
-{
- switch (member) {
- case RES_USAGE:
- return &counter->usage;
- case RES_MAX_USAGE:
- return &counter->max_usage;
- case RES_LIMIT:
- return &counter->limit;
- case RES_FAILCNT:
- return &counter->failcnt;
- case RES_SOFT_LIMIT:
- return &counter->soft_limit;
- };
-
- BUG();
- return NULL;
-}
-
-ssize_t res_counter_read(struct res_counter *counter, int member,
- const char __user *userbuf, size_t nbytes, loff_t *pos,
- int (*read_strategy)(unsigned long long val, char *st_buf))
-{
- unsigned long long *val;
- char buf[64], *s;
-
- s = buf;
- val = res_counter_member(counter, member);
- if (read_strategy)
- s += read_strategy(*val, s);
- else
- s += sprintf(s, "%llu\n", *val);
- return simple_read_from_buffer((void __user *)userbuf, nbytes,
- pos, buf, s - buf);
-}
-
-#if BITS_PER_LONG == 32
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&counter->lock, flags);
- ret = *res_counter_member(counter, member);
- spin_unlock_irqrestore(&counter->lock, flags);
-
- return ret;
-}
-#else
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- return *res_counter_member(counter, member);
-}
-#endif
-
-int res_counter_memparse_write_strategy(const char *buf,
- unsigned long long *resp)
-{
- char *end;
- unsigned long long res;
-
- /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
- if (*buf == '-') {
- int rc = kstrtoull(buf + 1, 10, &res);
-
- if (rc)
- return rc;
- if (res != 1)
- return -EINVAL;
- *resp = RES_COUNTER_MAX;
- return 0;
- }
-
- res = memparse(buf, &end);
- if (*end != '\0')
- return -EINVAL;
-
- if (PAGE_ALIGN(res) >= res)
- res = PAGE_ALIGN(res);
- else
- res = RES_COUNTER_MAX;
-
- *resp = res;
-
- return 0;
-}
diff --git a/kernel/resource.c b/kernel/resource.c
index 60c5a3856ab7..0bcebffc4e77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+/*
+ * Search for a resource entry that fully contains the specified region.
+ * If found, return 1 if it is RAM, 0 if not.
+ * If not found, or the region is not fully contained, return -1.
+ *
+ * Used by the ioremap functions to ensure the user is not remapping RAM; it is
+ * a vast speedup over walking through the resource table page by page.
+ */
+int region_is_ram(resource_size_t start, unsigned long size)
+{
+ struct resource *p;
+ resource_size_t end = start + size - 1;
+ int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ const char *name = "System RAM";
+ int ret = -1;
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ if (end < p->start)
+ continue;
+
+ if (p->start <= start && end <= p->end) {
+ /* resource fully contains region */
+ if ((p->flags != flags) || strcmp(p->name, name))
+ ret = 0;
+ else
+ ret = 1;
+ break;
+ }
+ if (p->end < start)
+ break; /* not found */
+ }
+ read_unlock(&resource_lock);
+ return ret;
+}
+
void __weak arch_remove_reservations(struct resource *avail)
{
}
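For illustration only, a minimal sketch (not part of this patch) of how an
ioremap-style helper might consult region_is_ram(); the wrapper name is
hypothetical:

#include <linux/io.h>
#include <linux/ioport.h>

/* Refuse to remap anything region_is_ram() positively identifies as RAM. */
static void __iomem *example_ioremap_checked(resource_size_t phys,
					     unsigned long size)
{
	if (region_is_ram(phys, size) == 1)
		return NULL;	/* remapping System RAM is not allowed */

	return ioremap(phys, size);
}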
@@ -1245,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent,
/*
* Managed region resource
*/
+static void devm_resource_release(struct device *dev, void *ptr)
+{
+ struct resource **r = ptr;
+
+ release_resource(*r);
+}
+
+/**
+ * devm_request_resource() - request and reserve an I/O or memory resource
+ * @dev: device for which to request the resource
+ * @root: root of the resource tree from which to request the resource
+ * @new: descriptor of the resource to request
+ *
+ * This is a device-managed version of request_resource(). There is usually
+ * no need to release resources requested by this function explicitly since
+ * that will be taken care of when the device is unbound from its driver.
+ * If for some reason the resource needs to be released explicitly, because
+ * of ordering issues for example, drivers must call devm_release_resource()
+ * rather than the regular release_resource().
+ *
+ * When a conflict is detected between any existing resources and the newly
+ * requested resource, an error message will be printed.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int devm_request_resource(struct device *dev, struct resource *root,
+ struct resource *new)
+{
+ struct resource *conflict, **ptr;
+
+ ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ *ptr = new;
+
+ conflict = request_resource_conflict(root, new);
+ if (conflict) {
+ dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
+ new, conflict->name, conflict);
+ devres_free(ptr);
+ return -EBUSY;
+ }
+
+ devres_add(dev, ptr);
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_resource);
+
+static int devm_resource_match(struct device *dev, void *res, void *data)
+{
+ struct resource **ptr = res;
+
+ return *ptr == data;
+}
+
+/**
+ * devm_release_resource() - release a previously requested resource
+ * @dev: device for which to release the resource
+ * @new: descriptor of the resource to release
+ *
+ * Releases a resource previously requested using devm_request_resource().
+ */
+void devm_release_resource(struct device *dev, struct resource *new)
+{
+ WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
+ new));
+}
+EXPORT_SYMBOL(devm_release_resource);
+
struct region_devres {
struct resource *parent;
resource_size_t start;
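As a usage sketch (not from this patch), a hypothetical platform driver
reserving a fixed register window with devm_request_resource(); the device,
addresses and names are made up:

#include <linux/ioport.h>
#include <linux/platform_device.h>

static struct resource example_regs = {
	.start = 0xfe000000,		/* hypothetical register window */
	.end   = 0xfe000fff,
	.name  = "example-regs",
	.flags = IORESOURCE_MEM,
};

static int example_probe(struct platform_device *pdev)
{
	int ret;

	/* Reserved against iomem_resource; released automatically on unbind. */
	ret = devm_request_resource(&pdev->dev, &iomem_resource, &example_regs);
	if (ret)
		return ret;

	/* ... map and use the registers ... */
	return 0;
}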
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
goto out;
- t = p;
- do {
+ for_each_thread(p, t)
sched_move_task(t);
- } while_each_thread(p, t);
-
out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3ef6451e972e..c27e4f8f4879 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
static inline struct sched_clock_data *this_scd(void)
{
- return &__get_cpu_var(sched_clock_data);
+ return this_cpu_ptr(&sched_clock_data);
}
static inline struct sched_clock_data *cpu_sdc(int cpu)
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
+ * for IO (which traditionally means blkio only).
*/
void __sched wait_for_completion_io(struct completion *x)
{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
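A hedged sketch of the intended use (not from this patch): a driver waiting
for a block transfer, where the sleep is charged as iowait; struct example_dev
and example_start_dma() are hypothetical:

#include <linux/completion.h>

struct example_dev {
	struct completion *done;
	int status;
};

/* Started elsewhere; its interrupt handler calls complete(ed->done). */
void example_start_dma(struct example_dev *ed);

static int example_do_transfer(struct example_dev *ed)
{
	DECLARE_COMPLETION_ONSTACK(done);

	ed->done = &done;
	example_start_dma(ed);
	/* Time spent here is accounted as iowait, not as a plain sleep. */
	wait_for_completion_io(&done);

	return ed->status;
}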
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286684a5..c0accc00566e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#ifdef smp_mb__before_atomic
-void __smp_mb__before_atomic(void)
-{
- smp_mb__before_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__before_atomic);
-#endif
-
-#ifdef smp_mb__after_atomic
-void __smp_mb__after_atomic(void)
-{
- smp_mb__after_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__after_atomic);
-#endif
-
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
}
}
@@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
}
}
@@ -449,7 +439,15 @@ static void __hrtick_start(void *arg)
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+ ktime_t time;
+ s64 delta;
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense and can cause timer DoS.
+ */
+ delta = max_t(s64, delay, 10000LL);
+ time = ktime_add_ns(timer->base->get_time(), delta);
hrtimer_set_expires(timer, time);
@@ -1010,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * Can drop rq->lock because sched_class::switched_from() methods may drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1017,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possible rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1043,7 +1045,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -1056,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !p->on_rq);
#ifdef CONFIG_LOCKDEP
/*
@@ -1088,7 +1090,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
src_rq = task_rq(p);
@@ -1214,7 +1216,7 @@ static int migration_cpu_stop(void *data);
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, on_rq;
+ int running, queued;
unsigned long ncsw;
struct rq *rq;
@@ -1252,7 +1254,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1286,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1409,7 +1411,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1478,7 +1481,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1540,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
int ret = 0;
rq = __task_rq_lock(p);
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
@@ -1620,6 +1623,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
}
}
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (is_idle_task(rq->curr))
+ smp_send_reschedule(cpu);
+ /* Else the cpu is not idle; do nothing here */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1742,7 +1769,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
@@ -1776,6 +1803,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
}
/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+}
+
+/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
@@ -1799,10 +1840,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- p->dl.dl_runtime = p->dl.runtime = 0;
- p->dl.dl_deadline = p->dl.deadline = 0;
- p->dl.dl_period = 0;
- p->dl.flags = 0;
+ __dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -1825,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
+ p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
- INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1977,6 +2013,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -1985,6 +2023,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -2002,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
}
#endif
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* We must be sure that accepting a new task (or allowing changing the
* parameters of an existing one) is consistent with the bandwidth
@@ -2095,7 +2116,7 @@ void wake_up_new_task(struct task_struct *p)
init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -2188,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
/**
* finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -2200,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
*/
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
+ struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
@@ -2243,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
tick_nohz_task_switch(current);
+ return rq;
}
#ifdef CONFIG_SMP
@@ -2277,29 +2304,22 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ struct rq *rq;
- /*
- * FIXME: do we need to worry about rq being invalidated by the
- * task_switch?
- */
+ /* finish_task_switch() drops rq->lock and enables preemption */
+ preempt_disable();
+ rq = finish_task_switch(prev);
post_schedule(rq);
-
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
- /* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
-#endif
+
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
*/
-static inline void
+static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -2333,21 +2353,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-#endif
context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
-
barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+
+ return finish_task_switch(prev);
}
/*
@@ -2366,6 +2379,18 @@ unsigned long nr_running(void)
return sum;
}
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+ if (cpu_rq(smp_processor_id())->nr_running == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -2437,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- * thread, breaking clock_gettime().
- */
- if (task_current(rq, p) && p->on_rq) {
- update_rq_clock(rq);
- ns = rq_clock_task(rq) - p->se.exec_start;
- if ((s64)ns < 0)
- ns = 0;
- }
-
- return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns = 0;
-
- rq = task_rq_lock(p, &flags);
- ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2483,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 ns = 0;
+ u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
@@ -2497,12 +2484,21 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
- if (!p->on_cpu || !p->on_rq)
+ if (!p->on_cpu || !task_on_rq_queued(p))
return p->se.sum_exec_runtime;
#endif
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);
return ns;
@@ -2660,6 +2656,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
*/
static inline void schedule_debug(struct task_struct *prev)
{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+#endif
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path. Otherwise whine
@@ -2761,7 +2760,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
prev = rq->curr;
schedule_debug(prev);
@@ -2801,7 +2800,7 @@ need_resched:
switch_count = &prev->nvcsw;
}
- if (prev->on_rq || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
@@ -2814,15 +2813,8 @@ need_resched:
rq->curr = next;
++*switch_count;
- context_switch(rq, prev, next); /* unlocks the rq */
- /*
- * The context switch have flipped the stack from under us
- * and restored the local variables which were saved when
- * this task called schedule() in the past. prev == current
- * is still correct, but it can be moved to another cpu/rq.
- */
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
} else
raw_spin_unlock_irq(&rq->lock);
@@ -2862,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
* or we have been woken up remotely but the IPI has not yet arrived,
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
*/
- user_exit();
+ enum ctx_state prev_state = exception_enter();
schedule();
- user_enter();
+ exception_exit(prev_state);
}
#endif
@@ -2910,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#endif /* CONFIG_PREEMPT */
/*
@@ -2966,7 +3003,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
@@ -2995,12 +3032,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
/*
* Boosting condition are:
@@ -3037,7 +3074,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
@@ -3048,7 +3085,7 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, on_rq;
+ int old_prio, delta, queued;
unsigned long flags;
struct rq *rq;
@@ -3069,8 +3106,8 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
@@ -3079,7 +3116,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
@@ -3351,7 +3388,7 @@ static int __sched_setscheduler(struct task_struct *p,
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int retval, oldprio, oldpolicy = -1, queued, running;
int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
@@ -3548,19 +3585,19 @@ change:
return 0;
}
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
@@ -3984,14 +4021,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_unlock;
+ goto out_free_new_mask;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_unlock;
+ goto out_free_new_mask;
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4004,13 +4041,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
* root_domain.
*/
#ifdef CONFIG_SMP
- if (task_has_dl_policy(p)) {
- const struct cpumask *span = task_rq(p)->rd->span;
-
- if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
retval = -EBUSY;
- goto out_unlock;
+ rcu_read_unlock();
+ goto out_free_new_mask;
}
+ rcu_read_unlock();
}
#endif
again:
@@ -4028,7 +4066,7 @@ again:
goto again;
}
}
-out_unlock:
+out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
@@ -4489,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ ppid = 0;
rcu_read_lock();
- ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
task_pid_nr(p), ppid,
@@ -4512,7 +4552,7 @@ void show_state_filter(unsigned long state_filter)
" task PC stack pid father\n");
#endif
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
@@ -4520,7 +4560,7 @@ void show_state_filter(unsigned long state_filter)
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
touch_all_softlockup_watchdogs();
@@ -4575,7 +4615,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
- idle->on_rq = 1;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4595,7 +4635,109 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
#ifdef CONFIG_SMP
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4652,14 +4794,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (p->on_rq) {
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
- }
+ } else if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
out:
task_rq_unlock(rq, p, &flags);
@@ -4680,20 +4823,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- struct rq *rq_dest, *rq_src;
+ struct rq *rq;
int ret = 0;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
- rq_src = cpu_rq(src_cpu);
- rq_dest = cpu_rq(dest_cpu);
+ rq = cpu_rq(src_cpu);
raw_spin_lock(&p->pi_lock);
- double_rq_lock(rq_src, rq_dest);
+ raw_spin_lock(&rq->lock);
/* Already moved. */
if (task_cpu(p) != src_cpu)
goto done;
+
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
goto fail;
@@ -4702,16 +4845,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->on_rq) {
- dequeue_task(rq_src, p, 0);
- set_task_cpu(p, dest_cpu);
- enqueue_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p, 0);
- }
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
done:
ret = 1;
fail:
- double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -4743,22 +4882,22 @@ void sched_setnuma(struct task_struct *p, int nid)
{
struct rq *rq;
unsigned long flags;
- bool on_rq, running;
+ bool queued, running;
rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
p->numa_preferred_nid = nid;
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
@@ -4778,6 +4917,12 @@ static int migration_cpu_stop(void *data)
* be on another cpu but it doesn't matter.
*/
local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
local_irq_enable();
return 0;
@@ -5188,6 +5333,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
{
unsigned long flags;
long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
@@ -5195,15 +5341,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
/* explicitly allow suspend */
if (!(action & CPU_TASKS_FROZEN)) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
bool overflow;
int cpus;
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(cpu);
overflow = __dl_overflow(dl_b, cpus, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
if (overflow)
return notifier_from_errno(-EBUSY);
}
@@ -5746,7 +5896,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
- struct sched_domain *child;
+ struct sched_domain *sibling;
int i;
cpumask_clear(covered);
@@ -5757,10 +5907,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
- child = *per_cpu_ptr(sdd->sd, i);
+ sibling = *per_cpu_ptr(sdd->sd, i);
/* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5770,10 +5920,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
- if (child->child) {
- child = child->child;
- cpumask_copy(sg_span, sched_domain_span(child));
- } else
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
cpumask_set_cpu(i, sg_span);
cpumask_or(covered, covered, sg_span);
@@ -6011,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
@@ -6183,7 +6334,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
}
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
{
int i;
@@ -6198,6 +6349,56 @@ static bool find_numa_distance(int distance)
return false;
}
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
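A toy, self-contained illustration of the classification rules described in
the comment above, applied to a made-up four-node distance table (plain
userspace C, not kernel code):

#include <limits.h>
#include <stdio.h>

#define N 4
static const int dist[N][N] = {		/* hypothetical node_distance() values */
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 20 },
	{ 20, 20, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int a, b, c, max = 0, min_remote = INT_MAX;
	const char *type;

	for (a = 0; a < N; a++)
		for (b = 0; b < N; b++) {
			if (a == b)
				continue;
			max = dist[a][b] > max ? dist[a][b] : max;
			min_remote = dist[a][b] < min_remote ? dist[a][b] : min_remote;
		}

	if (max == min_remote) {
		type = "NUMA_DIRECT";	/* every remote node is one hop away */
	} else {
		type = "NUMA_BACKPLANE";
		for (a = 0; a < N; a++)
			for (b = 0; b < N; b++) {
				if (dist[a][b] < max)
					continue;
				/* some node strictly closer to both far-apart ends? */
				for (c = 0; c < N; c++)
					if (dist[a][c] < max && dist[b][c] < max)
						type = "NUMA_GLUELESS_MESH";
			}
	}

	printf("topology: %s (max distance %d)\n", type, max);
	return 0;
}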
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
@@ -6251,6 +6452,10 @@ static void sched_init_numa(void)
if (!sched_debug())
break;
}
+
+ if (!level)
+ return;
+
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
@@ -6330,6 +6535,9 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
}
static void sched_domains_numa_masks_set(int cpu)
@@ -6905,9 +7113,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
- alloc_size += num_possible_cpus() * cpumask_size();
-#endif
if (alloc_size) {
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -6927,13 +7132,13 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
+ }
#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -7082,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state;
+ * since we will exit with TASK_RUNNING, make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ if (WARN_ONCE(current->state != TASK_RUNNING,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change))
+ __set_current_state(TASK_RUNNING);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
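A sketch of the anti-pattern the new warning is meant to catch (hypothetical
code, not from this patch): calling a blocking primitive after setting a
non-running task state wipes that state out, so the later schedule() no longer
waits as intended:

#include <linux/mutex.h>
#include <linux/sched.h>

static DEFINE_MUTEX(example_lock);	/* hypothetical lock */

static void example_buggy_wait(void)
{
	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * mutex_lock() may sleep; __might_sleep() now warns because we are
	 * no longer TASK_RUNNING, and the sleep resets our state, so...
	 */
	mutex_lock(&example_lock);
	mutex_unlock(&example_lock);

	schedule();	/* ...this no longer blocks the way the author intended */
}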
@@ -7113,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
#endif
dump_stack();
}
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -7124,13 +7348,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
.sched_policy = SCHED_NORMAL,
};
int old_prio = p->prio;
- int on_rq;
+ int queued;
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
__setscheduler(rq, p, &attr);
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
resched_curr(rq);
}
@@ -7144,12 +7368,12 @@ void normalize_rt_tasks(void)
unsigned long flags;
struct rq *rq;
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, p) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
/*
* Only normalize user tasks:
*/
- if (!p->mm)
+ if (p->flags & PF_KTHREAD)
continue;
p->se.exec_start = 0;
@@ -7164,21 +7388,16 @@ void normalize_rt_tasks(void)
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (task_nice(p) < 0 && p->mm)
+ if (task_nice(p) < 0)
set_user_nice(p, 0);
continue;
}
- raw_spin_lock(&p->pi_lock);
- rq = __task_rq_lock(p);
-
+ rq = task_rq_lock(p, &flags);
normalize_task(rq, p);
-
- __task_rq_unlock(rq);
- raw_spin_unlock(&p->pi_lock);
- } while_each_thread(g, p);
-
- read_unlock_irqrestore(&tasklist_lock, flags);
+ task_rq_unlock(rq, p, &flags);
+ }
+ read_unlock(&tasklist_lock);
}
#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7318,36 +7537,40 @@ void sched_offline_group(struct task_group *tg)
void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
- int on_rq, running;
+ int queued, running;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->on_rq;
+ queued = task_on_rq_queued(tsk);
- if (on_rq)
+ if (queued)
dequeue_task(rq, tsk, 0);
if (unlikely(running))
- tsk->sched_class->put_prev_task(rq, tsk);
+ put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgrp_id,
- lockdep_is_held(&tsk->sighand->siglock)),
+ /*
+ * All callers are synchronized by task_rq_lock(), so RCU protection
+ * would be pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, on_rq);
+ tsk->sched_class->task_move_group(tsk, queued);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, tsk, &flags);
@@ -7365,10 +7588,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
- do_each_thread(g, p) {
- if (rt_task(p) && task_rq(p)->rt.tg == tg)
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
return 1;
- } while_each_thread(g, p);
+ }
return 0;
}
@@ -7577,6 +7800,7 @@ static int sched_dl_global_constraints(void)
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
int cpu, ret = 0;
unsigned long flags;
@@ -7590,13 +7814,16 @@ static int sched_dl_global_constraints(void)
* solutions is welcome!
*/
for_each_possible_cpu(cpu) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
if (new_bw < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
if (ret)
break;
}
@@ -7607,6 +7834,7 @@ static int sched_dl_global_constraints(void)
static void sched_dl_do_global(void)
{
u64 new_bw = -1;
+ struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -7620,11 +7848,14 @@ static void sched_dl_do_global(void)
* FIXME: As above...
*/
for_each_possible_cpu(cpu) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
dl_b->bw = new_bw;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
}
}
@@ -7754,6 +7985,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -8005,7 +8241,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
quota = normalize_cfs_quota(tg, d);
- parent_quota = parent_b->hierarchal_quota;
+ parent_quota = parent_b->hierarchical_quota;
/*
* ensure max(child_quota) <= parent_quota, inherit when no
@@ -8016,7 +8252,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
else if (parent_quota != RUNTIME_INF && quota > parent_quota)
return -EINVAL;
}
- cfs_b->hierarchal_quota = quota;
+ cfs_b->hierarchical_quota = quota;
return 0;
}
@@ -8126,6 +8362,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, cp->free_cpus,
- &p->cpus_allowed) && cpumask_and(later_mask,
- later_mask, cpu_active_mask)) {
+ if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
struct signal_struct *sig = tsk->signal;
cputime_t utime, stime;
struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
+ unsigned int seq, nextseq;
+ unsigned long flags;
rcu_read_lock();
- /* make sure we can trust tsk->thread_group list */
- if (!likely(pid_alive(tsk)))
- goto out;
-
- t = tsk;
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
do {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- } while_each_thread(tsk, t);
-out:
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
rcu_read_unlock();
}
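The same lockless-first, locked-retry idiom in generic form (a sketch; struct
example_stats and its fields are hypothetical):

#include <linux/seqlock.h>
#include <linux/types.h>

struct example_stats {
	seqlock_t lock;
	u64 a, b;
};

static u64 example_read_stats(struct example_stats *s)
{
	unsigned long flags;
	int seq, nextseq;
	u64 sum;

	nextseq = 0;				/* first pass: lockless read */
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&s->lock, &seq);
		sum = s->a + s->b;
		/* if the lockless read raced with a writer, retry locked */
		nextseq = 1;
	} while (need_seqretry(&s->lock, seq));
	done_seqretry_irqrestore(&s->lock, seq, flags);

	return sum;
}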
@@ -550,6 +555,23 @@ drop_precision:
}
/*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated counter the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+ cputime_t old;
+
+ while (new > (old = ACCESS_ONCE(*counter)))
+ cmpxchg_cputime(counter, old, new);
+}
+
+/*
* Adjust tick based cputime random precision against scheduler
* runtime accounting.
*/
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
utime = rtime - stime;
}
- /*
- * If the tick based count grows faster than the scheduler one,
- * the result of the scaling may go backward.
- * Let's enforce monotonicity.
- */
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
+ cputime_advance(&prev->stime, stime);
+ cputime_advance(&prev->utime, utime);
out:
*ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
-/*
- * Must be called with siglock held.
- */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
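The monotonic-advance idiom used by cputime_advance() above, shown as a
standalone userspace sketch with C11 atomics (illustration only, not kernel
code):

#include <stdatomic.h>
#include <stdint.h>

/* Move *counter forward to 'new', but never backwards, even when several
 * threads race; a failed compare-exchange reloads 'old' and we recheck. */
static void advance(_Atomic uint64_t *counter, uint64_t new)
{
	uint64_t old = atomic_load(counter);

	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;
}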
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,21 +518,29 @@ again:
}
/*
- * We need to take care of a possible races here. In fact, the
- * task might have changed its scheduling policy to something
- * different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setattr()).
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different from SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all these cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
*/
- if (!dl_task(p) || dl_se->dl_new)
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (task_has_dl_policy(rq->curr))
+ if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
@@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -567,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
{
- int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
- int rorun = dl_se->runtime <= 0;
-
- if (!rorun && !dmiss)
- return 0;
-
- /*
- * If we are beyond our current deadline and we are still
- * executing, then we have already used some of the runtime of
- * the next instance. Thus, if we do not account that, we are
- * stealing bandwidth from the system at each deadline miss!
- */
- if (dmiss) {
- dl_se->runtime = rorun ? dl_se->runtime : 0;
- dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
- }
-
- return 1;
+ return (dl_se->runtime <= 0);
}
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -625,7 +611,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
__dequeue_task_dl(rq, curr, 0);
if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -823,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
- replenish_dl_entity(dl_se, pi_se);
- else
+ if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
+ else if (flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
__enqueue_dl_entity(dl_se);
}
@@ -847,8 +833,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceeds its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -914,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (sd_flag != SD_BALANCE_WAKE)
goto out;
rq = cpu_rq(cpu);
@@ -997,10 +994,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SCHED_HRTICK
static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
- s64 delta = p->dl.dl_runtime - p->dl.runtime;
-
- if (delta > 10000)
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, p->dl.runtime);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
}
#endif
@@ -1030,7 +1028,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
- if (rq->stop && rq->stop->on_rq)
+ if (rq->stop && task_on_rq_queued(rq->stop))
return RETRY_TASK;
}
@@ -1055,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
/* Running task will never be pushed. */
dequeue_pushable_dl_task(rq, p);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
-#endif
set_post_schedule(rq);
@@ -1077,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
start_hrtick_dl(rq, p);
-#endif
}
static void task_fork_dl(struct task_struct *p)
@@ -1124,10 +1118,8 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
- (p->nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
-
return 0;
}
@@ -1158,7 +1150,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
{
struct sched_domain *sd;
- struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
+ struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
int best_cpu, cpu = task_cpu(task);
@@ -1169,6 +1161,13 @@ static int find_later_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1;
+ /*
+ * We have to consider system topology and task affinity
+ * first, then we can look for a suitable cpu.
+ */
+ cpumask_copy(later_mask, task_rq(task)->rd->span);
+ cpumask_and(later_mask, later_mask, cpu_active_mask);
+ cpumask_and(later_mask, later_mask, &task->cpus_allowed);
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
task, later_mask);
if (best_cpu == -1)
@@ -1257,7 +1256,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
- task_running(rq, task) || !task->on_rq)) {
+ task_running(rq, task) ||
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
return p;
@@ -1311,6 +1311,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ int ret = 0;
if (!rq->dl.overloaded)
return 0;
@@ -1356,7 +1357,6 @@ retry:
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
*/
- dequeue_pushable_dl_task(rq, next_task);
goto out;
}
@@ -1372,6 +1372,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ ret = 1;
resched_curr(later_rq);
@@ -1380,7 +1381,7 @@ retry:
out:
put_task_struct(next_task);
- return 1;
+ return ret;
}
static void push_dl_tasks(struct rq *rq)
@@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq)
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* Then we pull iff p has actually an earlier
@@ -1486,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
- dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
}
@@ -1495,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq;
+ struct root_domain *src_rd;
int weight;
BUG_ON(!dl_task(p));
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporarily fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
/*
* Update only if the task is actually running (i.e.,
* it is on the rq AND it is not throttled).
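As a side note on the bandwidth bookkeeping above: __dl_clear() releases the task's utilization (runtime/period in 20-bit fixed point) from the source root_domain's pool, mirroring the reservation cpuset_can_attach() already made in the destination. A minimal userspace sketch, assuming the usual total_bw accounting behind __dl_clear()/__dl_add(); all names here are invented for illustration:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for struct dl_bw: one root_domain's DL bandwidth pool. */
struct toy_dl_bw {
	int64_t bw;		/* admissible fraction, -1 == unlimited        */
	int64_t total_bw;	/* bandwidth currently admitted in this domain */
};

/* runtime/period as a 20-bit fixed-point fraction, like to_ratio(). */
static int64_t toy_to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (int64_t)((runtime_ns << 20) / period_ns);
}

static void toy_dl_clear(struct toy_dl_bw *b, int64_t tsk_bw) { b->total_bw -= tsk_bw; }
static void toy_dl_add(struct toy_dl_bw *b, int64_t tsk_bw)   { b->total_bw += tsk_bw; }

int main(void)
{
	/* 10ms every 100ms ~= 10% of a CPU, ~104857 bandwidth units. */
	int64_t tsk_bw = toy_to_ratio(100000000ULL, 10000000ULL);
	struct toy_dl_bw src = { 1 << 20, tsk_bw };	/* task currently charged here */
	struct toy_dl_bw dst = { 1 << 20, 0 };

	/* Migration between exclusive cpusets: release in src, charge in dst. */
	toy_dl_clear(&src, tsk_bw);
	toy_dl_add(&dst, tsk_bw);
	printf("tsk_bw=%lld src=%lld dst=%lld\n",
	       (long long)tsk_bw, (long long)src.total_bw, (long long)dst.total_bw);
	return 0;
}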
@@ -1515,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
- rq = task_rq(p);
-
/*
* The process used to be able to migrate OR it can now migrate
*/
@@ -1564,20 +1586,48 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * Our caller must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ cancel_dl_timer(rq, p);
+
+ __dl_clear_params(p);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
@@ -1596,14 +1646,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (unlikely(p->dl.dl_throttled))
return;
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
- if (check_resched && task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
}
}
@@ -1614,7 +1669,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (p->on_rq || rq->curr == p) {
+ if (task_on_rq_queued(p) || rq->curr == p) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
@@ -1673,4 +1728,15 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- unsigned long flags;
SEQ_printf(m,
"\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
"------------------------------------------------------"
"----------------------------------------------------\n");
- read_lock_irqsave(&tasklist_lock, flags);
-
- do_each_thread(g, p) {
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
- } while_each_thread(g, p);
-
- read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ rcu_read_unlock();
}
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -264,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
@@ -332,10 +335,9 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
- rcu_read_lock();
print_rq(m, rq, cpu);
- rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
@@ -533,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults_memory)
- nr_faults = p->numa_faults_memory[2*node + i];
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
+#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -724,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -826,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -865,7 +873,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;
struct rcu_head rcu;
nodemask_t active_nodes;
@@ -893,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
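A quick way to see the layout the comment above describes is to evaluate the index arithmetic on a small box. A standalone sketch, assuming the enum ordering NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF (so the averaged stats land in the first half of the array and the per-scan buffers in the second) and a made-up 4-node machine:

#include <stdio.h>

enum numa_faults_stats { NUMA_MEM = 0, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2	/* private, shared */

static int nr_node_ids = 4;		/* pretend 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	/* Averaged stats occupy the first half, per-scan buffers the second. */
	printf("MEM    nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_MEM,    2, 1)); /* 5  */
	printf("CPU    nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_CPU,    2, 1)); /* 13 */
	printf("MEMBUF nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_MEMBUF, 2, 1)); /* 21 */
	printf("CPUBUF nid=2 priv=1 -> %d\n", task_faults_idx(NUMA_CPUBUF, 2, 1)); /* 29 */
	return 0;
}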
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -912,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
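For the glueless-mesh branch above, the per-node contribution is a plain linear fade with distance. A tiny worked sketch with invented numbers (LOCAL_DISTANCE taken as the conventional SLIT value of 10):

#include <stdio.h>

#define LOCAL_DISTANCE 10

/* Scale a remote node's faults down with distance, as in the glueless-mesh case. */
static unsigned long scale_faults(unsigned long faults, int dist, int max_dist)
{
	faults *= (max_dist - dist);
	faults /= (max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	int max_dist = 40;

	/* 1000 faults on a node at distance 20 vs. one at distance 30. */
	printf("dist 20 -> %lu\n", scale_faults(1000, 20, max_dist)); /* 666 */
	printf("dist 30 -> %lu\n", scale_faults(1000, 30, max_dist)); /* 333 */
	return 0;
}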
/*
@@ -928,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
@@ -940,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1038,7 +1131,8 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int cpu, cpus = 0;
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1156,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
if (!cpus)
return;
- ns->task_capacity =
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ ns->task_capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
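The task_capacity computation above is easiest to sanity-check with concrete numbers. A throwaway sketch, assuming an invented node with 8 logical CPUs of 2-way SMT at roughly 589 capacity units each:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned long cpus = 8, compute_capacity = 8 * 589;
	unsigned long smt, capacity, task_capacity;

	/* smt := ceil(cpus / capacity-in-cpus), expected to land on 1 or 2 */
	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
	capacity = cpus / smt;					/* cores */
	task_capacity = capacity;
	if (DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE) < task_capacity)
		task_capacity = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE);

	/* prints: smt=2 cores=4 task_capacity=4 */
	printf("smt=%lu cores=%lu task_capacity=%lu\n", smt, capacity, task_capacity);
	return 0;
}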
@@ -1076,6 +1174,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
@@ -1155,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end up trying to select ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
/*
* "imp" is the fault differential for the source task between the
@@ -1178,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1193,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
@@ -1206,7 +1323,7 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_free_capacity &&
+ if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
@@ -1252,6 +1369,13 @@ balance:
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
+ /*
+ * One idle CPU per node is evaluated for a task numa move.
+ * Call select_idle_sibling to maybe find a better one.
+ */
+ if (!cur)
+ env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
assign:
task_numa_assign(env, cur, imp);
unlock:
@@ -1289,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
@@ -1317,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1394,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1506,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
@@ -1543,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group"; the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
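The backplane search is the least obvious part of preferred_group_nid(): at each distance it keeps only the highest-faulting group of nodes, until single nodes remain. A toy, single-file approximation of that narrowing (bitmasks instead of nodemask_t, an invented distance table and fault counts, and without the node_clear() trick that keeps the kernel's search cheap):

#include <stdio.h>

#define NR_NODES	4
#define LOCAL_DISTANCE	10

/* Toy SLIT: {0,1} and {2,3} each share a backplane hop, cross traffic costs 40. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

static const unsigned long faults[NR_NODES] = { 100, 300, 250, 200 };

/* Narrow the candidate mask to the highest-faulting group at each distance. */
static int toy_preferred_nid(int nid, int max_dist)
{
	unsigned int nodes = (1u << NR_NODES) - 1;
	int d, a, b;

	for (d = max_dist; d > LOCAL_DISTANCE; d--) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0;

		for (a = 0; a < NR_NODES; a++) {
			unsigned long f = 0;
			unsigned int group = 0;

			if (!(nodes & (1u << a)))
				continue;
			for (b = 0; b < NR_NODES; b++)
				if ((nodes & (1u << b)) && dist[a][b] < d) {
					f += faults[b];
					group |= 1u << b;
				}
			if (f > max_faults) {
				max_faults = f;
				max_group = group;
				nid = a;	/* at the smallest distance this is the single winner */
			}
		}
		if (max_group)
			nodes = max_group;	/* next rounds search inside the top group */
	}
	return nid;
}

int main(void)
{
	/* Group {2,3} has 450 faults vs. 300+100 for {0,1}; node 2 wins inside it. */
	printf("preferred nid = %d\n", toy_preferred_nid(0, 40));	/* 2 */
	return 0;
}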
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1570,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
@@ -1591,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
@@ -1625,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}
if (max_faults) {
@@ -1668,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1677,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1736,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;
@@ -1762,27 +1996,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
- rcu_assign_pointer(p->numa_group, NULL);
+ RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}
@@ -1804,29 +2034,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
- /* Do not worry about placement if exiting */
- if (p->state == TASK_DEAD)
- return;
-
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;
- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1866,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
@@ -1946,7 +2162,7 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma))
continue;
/*
@@ -2211,8 +2427,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
/*
* As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
- * With a look-up table which covers k^n (n<PERIOD)
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
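The corrected identity (y^n = 1/2^(n/PERIOD) * y^(n%PERIOD), with y^PERIOD = 1/2) is easy to confirm numerically; a floating-point sketch rather than the kernel's fixed-point lookup table:

#include <stdio.h>
#include <math.h>

#define PERIOD 32	/* y^PERIOD == 1/2 */

int main(void)
{
	double y = pow(0.5, 1.0 / PERIOD);
	unsigned int n = 70;

	double direct = pow(y, n);
	/* Shift by n/PERIOD half-lives, then look up the y^(n%PERIOD) remainder. */
	double split  = pow(0.5, n / PERIOD) * pow(y, n % PERIOD);

	printf("y^%u: direct=%.9f split=%.9f\n", n, direct, split);
	return 0;
}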
@@ -2377,6 +2593,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
+ if (!tg_contrib)
+ return;
+
if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic_long_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
@@ -3786,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
@@ -3892,14 +4115,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
resched_curr(rq);
return;
}
-
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- if (rq->curr != p)
- delta = max_t(s64, 10000LL, delta);
-
hrtick_start(rq, delta);
}
}
@@ -4087,7 +4302,7 @@ static unsigned long capacity_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
@@ -4213,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * tg->shares) / W;
+ wl = (w * (long)tg->shares) / W;
else
wl = tg->shares;
@@ -4276,8 +4491,8 @@ static int wake_wide(struct task_struct *p)
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
+ s64 this_eff_load, prev_eff_load;
int idx, this_cpu, prev_cpu;
- unsigned long tl_per_task;
struct task_group *tg;
unsigned long weight;
int balanced;
@@ -4320,47 +4535,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load > 0) {
- s64 this_eff_load, prev_eff_load;
+ this_eff_load = 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
+ if (this_load > 0) {
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
- balanced = this_eff_load <= prev_eff_load;
- } else
- balanced = true;
-
- /*
- * If the currently running task will sleep within
- * a reasonable amount of time then attract this newly
- * woken task:
- */
- if (sync && balanced)
- return 1;
+ balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
- if (balanced ||
- (this_load <= load &&
- this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ if (!balanced)
+ return 0;
- return 1;
- }
- return 0;
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
}
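The reworked wake_affine() always forms both effective loads, and only the this_load > 0 case multiplies in the load terms. A worked example with invented numbers (imbalance_pct = 125 and equal CPU capacities, so the cross-multiplied capacities cancel out), assuming this_load > 0:

#include <stdio.h>

int main(void)
{
	long imbalance_pct = 125, capacity = 1024;
	/* invented sums of load + effective_load(...) on each side */
	long this_load_eff = 900;	/* waking on this_cpu  */
	long prev_load_eff = 1100;	/* staying on prev_cpu */

	long this_eff_load = 100 * capacity * this_load_eff;
	long prev_eff_load = (100 + (imbalance_pct - 100) / 2) * capacity * prev_load_eff;

	/* prev side carries a ~12% handicap, so 900 vs 1100 still counts as balanced */
	printf("balanced = %d\n", this_eff_load <= prev_eff_load);	/* 1 */
	return 0;
}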
/*
@@ -4428,20 +4626,46 @@ static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
- int idlest = -1;
+ unsigned int min_exit_latency = UINT_MAX;
+ u64 latest_idle_timestamp = 0;
+ int least_loaded_cpu = this_cpu;
+ int shallowest_idle_cpu = -1;
int i;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- load = weighted_cpuload(i);
-
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- idlest = i;
+ if (idle_cpu(i)) {
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ rq->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If equal or no active idle state, then
+ * the most recently idled CPU might have
+ * a warmer cache.
+ */
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else if (shallowest_idle_cpu == -1) {
+ load = weighted_cpuload(i);
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
}
}
- return idlest;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
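The new pick order is: an idle CPU in the shallowest idle state, ties broken by the most recent idle timestamp, and the least-loaded CPU only when nothing is idle. A simplified standalone sketch with invented types (it ignores the NULL idle-state pointer case and the equal-load preference for this_cpu):

#include <stdio.h>

struct toy_cpu {
	int idle;
	unsigned int exit_latency;	/* of the entered idle state */
	unsigned long long idle_stamp;	/* when it went idle         */
	unsigned long load;
};

/* Pick the "shallowest" idle CPU; fall back to the least loaded busy one. */
static int toy_find_idlest(const struct toy_cpu *cpu, int n, int this_cpu)
{
	unsigned int min_exit_latency = ~0u;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = ~0ul;
	int least_loaded = this_cpu, shallowest_idle = -1, i;

	for (i = 0; i < n; i++) {
		if (cpu[i].idle) {
			if (cpu[i].exit_latency < min_exit_latency) {
				min_exit_latency = cpu[i].exit_latency;
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle = i;
			} else if (cpu[i].exit_latency == min_exit_latency &&
				   cpu[i].idle_stamp > latest_idle_timestamp) {
				/* same depth: prefer the most recently idled (warmer cache) */
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle = i;
			}
		} else if (shallowest_idle == -1 && cpu[i].load < min_load) {
			min_load = cpu[i].load;
			least_loaded = i;
		}
	}
	return shallowest_idle != -1 ? shallowest_idle : least_loaded;
}

int main(void)
{
	struct toy_cpu cpus[] = {
		{ 0,  0,  0, 700 },	/* busy            */
		{ 1, 50, 10,   0 },	/* deep idle       */
		{ 1, 10, 20,   0 },	/* shallow, newer  */
		{ 1, 10,  5,   0 },	/* shallow, older  */
	};
	printf("chosen cpu = %d\n", toy_find_idlest(cpus, 4, 0));	/* 2 */
	return 0;
}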
/*
@@ -4510,14 +4734,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
- want_affine = 1;
- new_cpu = prev_cpu;
- }
+ if (sd_flag & SD_BALANCE_WAKE)
+ want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -4704,7 +4922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -5112,27 +5330,18 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ struct list_head tasks;
};
/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
* Is this task likely cache-hot:
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
+ lockdep_assert_held(&env->src_rq->lock);
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -5164,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5203,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -5252,6 +5461,9 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -5310,24 +5522,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
- if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
+ if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-#endif
- return 1;
- }
-
- if (!tsk_cache_hot ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-
- if (tsk_cache_hot) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
- }
-
return 1;
}
@@ -5336,47 +5536,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}
/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
*
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
*/
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
+ lockdep_assert_held(&env->src_rq->lock);
+
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
- move_task(p, env);
+ detach_task(p, env);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
@@ -5407,14 +5623,16 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5652,58 @@ next:
}
/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
- return pulled;
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}
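The detach/attach split replaces double_rq_lock() with a protocol that holds only one rq lock at a time: detach the chosen tasks onto a private list under the source lock, then re-attach them under the destination lock. A toy userspace sketch of that protocol (pthread mutexes and invented types; it does not model the kernel's TASK_ON_RQ_MIGRATING marking):

#include <pthread.h>
#include <stdio.h>

struct toy_task { int pid; struct toy_task *next; };
struct toy_rq   { pthread_mutex_t lock; struct toy_task *head; };

static struct toy_task *pop(struct toy_rq *rq)
{
	struct toy_task *p = rq->head;
	if (p)
		rq->head = p->next;
	return p;
}

static void push(struct toy_rq *rq, struct toy_task *p)
{
	p->next = rq->head;
	rq->head = p;
}

/* Move up to nr tasks from src to dst, never holding both rq locks at once. */
static int toy_balance(struct toy_rq *src, struct toy_rq *dst, int nr)
{
	struct toy_task *detached = NULL, *p;
	int moved = 0;

	pthread_mutex_lock(&src->lock);		/* detach under the src lock only */
	while (nr-- && (p = pop(src))) {
		p->next = detached;		/* tasks parked on a private list */
		detached = p;
		moved++;
	}
	pthread_mutex_unlock(&src->lock);

	pthread_mutex_lock(&dst->lock);		/* attach under the dst lock only */
	while ((p = detached)) {
		detached = p->next;
		push(dst, p);
	}
	pthread_mutex_unlock(&dst->lock);
	return moved;
}

int main(void)
{
	struct toy_task t1 = { 1, NULL }, t2 = { 2, &t1 };
	struct toy_rq src = { PTHREAD_MUTEX_INITIALIZER, &t2 };
	struct toy_rq dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

	printf("moved %d tasks; dst head pid=%d\n",
	       toy_balance(&src, &dst, 2), dst.head->pid);
	return 0;
}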
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5559,6 +5822,13 @@ static unsigned long task_h_load(struct task_struct *p)
#endif
/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -5572,7 +5842,7 @@ struct sg_lb_stats {
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
+ enum group_type group_type;
int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -5610,6 +5880,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
},
};
}
@@ -5652,19 +5924,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
return default_scale_capacity(sd, cpu);
}
-static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long weight = sd->span_weight;
- unsigned long smt_gain = sd->smt_gain;
-
- smt_gain /= weight;
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
- return smt_gain;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_capacity(sd, cpu);
+ return default_scale_cpu_capacity(sd, cpu);
}
static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5973,15 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long weight = sd->span_weight;
unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_smt_capacity(sd, cpu);
- else
- capacity *= default_scale_smt_capacity(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_cpu_capacity(sd, cpu);
+ else
+ capacity *= default_scale_cpu_capacity(sd, cpu);
- capacity >>= SCHED_CAPACITY_SHIFT;
- }
+ capacity >>= SCHED_CAPACITY_SHIFT;
sdg->sgc->capacity_orig = capacity;
@@ -5891,6 +6158,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
return capacity_factor;
}
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
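group_classify() gives update_sd_pick_busiest() a strict severity ordering: overloaded beats imbalanced beats other, and avg_load only breaks ties within a class. A minimal sketch of that comparison with toy types:

#include <stdio.h>

/* Severity ordering used when picking the busiest group. */
enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct toy_sgs { enum group_type type; unsigned long avg_load; };

static int toy_pick_busiest(const struct toy_sgs *busiest, const struct toy_sgs *sgs)
{
	if (sgs->type > busiest->type)
		return 1;				/* worse class always wins   */
	if (sgs->type < busiest->type)
		return 0;
	return sgs->avg_load > busiest->avg_load;	/* same class: compare load  */
}

int main(void)
{
	struct toy_sgs busiest           = { group_overloaded,  900 };
	struct toy_sgs same_class_heavier = { group_overloaded,  950 };
	struct toy_sgs other_class_heavy  = { group_other,      2000 };

	printf("%d %d\n", toy_pick_busiest(&busiest, &same_class_heavier),	/* 1 */
			  toy_pick_busiest(&busiest, &other_class_heavy));	/* 0 */
	return 0;
}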
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -5920,7 +6199,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
- sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
*overload = true;
@@ -5942,9 +6221,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
-
- sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
+ sgs->group_type = group_classify(group, sgs);
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
@@ -5968,13 +6246,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->busiest_stat.avg_load)
- return false;
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_type > busiest->group_type)
return true;
- if (sgs->group_imb)
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/*
@@ -5982,8 +6266,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -6073,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
+ sds->local_stat.group_has_free_capacity) {
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ sgs->group_type = group_classify(sg, sgs);
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6228,7 +6513,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (busiest->group_imb) {
+ if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6533,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return fix_small_imbalance(env, sds);
}
- if (!busiest->group_imb) {
- /*
- * Don't want to pull so many tasks that a group would go idle.
- * Except of course for the group_imb case, since then we might
- * have to drop below capacity to reach cpu-load equilibrium.
- */
+ /*
+ * If there aren't any idle cpus, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
load_above_capacity =
(busiest->sum_nr_running - busiest->group_capacity_factor);
@@ -6337,7 +6621,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (busiest->group_imb)
+ if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6630,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/*
- * If the local group is more busy than the selected busiest group
+ * If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6645,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (env->idle == CPU_IDLE) {
/*
- * This cpu is idle. If the busiest group load doesn't
- * have more tasks than the number of available cpu's and
- * there is no imbalance between this and busiest group
- * wrt to idle cpu's, it is balanced.
+ * This cpu is idle. If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest group
+ * wrt idle cpus, it is balanced. The imbalance becomes
+ * significant if the diff is greater than 1; otherwise we
+ * might end up just moving the imbalance to another group.
*/
- if ((local->idle_cpus < busiest->idle_cpus) &&
- busiest->sum_nr_running <= busiest->group_weight)
+ if ((busiest->group_type != group_overloaded) &&
+ (local->idle_cpus <= (busiest->idle_cpus + 1)))
goto out_balanced;
} else {
/*
@@ -6539,7 +6824,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
.sd = sd,
@@ -6550,6 +6835,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};
/*
@@ -6599,23 +6885,30 @@ redo:
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
- local_irq_restore(flags);
+ cur_ld_moved = detach_tasks(&env);
/*
- * some other cpu did the load balance for us.
+ * We've detached some tasks from busiest_rq. Every
+ * task is marked TASK_ON_RQ_MIGRATING, so we can safely
+ * unlock busiest->lock and be sure that nobody can
+ * manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
*/
- if (cur_ld_moved && env.dst_cpu != smp_processor_id())
- resched_cpu(env.dst_cpu);
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
+ local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6958,8 @@ more_balance:
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
- } else if (*group_imbalance)
- *group_imbalance = 0;
}
/* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6970,7 @@ more_balance:
env.loop_break = sched_nr_migrate_break;
goto redo;
}
- goto out_balanced;
+ goto out_all_pinned;
}
}
@@ -6744,7 +7035,7 @@ more_balance:
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
@@ -6753,6 +7044,23 @@ more_balance:
goto out;
out_balanced:
+ /*
+ * We reach balance although we may have faced some affinity
+ * constraints. Clear the imbalance flag if it was set.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
+out_all_pinned:
+ /*
+ * We reach balance because all tasks are pinned at this level so
+ * we can't migrate them. Leave the imbalance flag set so the parent level
+ * can try to migrate them.
+ */
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
@@ -6914,6 +7222,7 @@ static int active_load_balance_cpu_stop(void *data)
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -6933,9 +7242,6 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -6956,16 +7262,22 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
return 0;
}
@@ -7465,7 +7777,7 @@ static void task_fork_fair(struct task_struct *p)
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7490,11 +7802,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7521,15 +7833,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *se = &p->se;
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7575,7 +7887,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
@@ -7594,7 +7906,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7917,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
@@ -7835,6 +8147,8 @@ const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .update_curr = update_curr_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
goto use_default;
+ /* Take note of the planned idle state. */
+ idle_set_state(this_rq(), &drv->states[next_state]);
+
/*
* Enter the idle state previously returned by the governor decision.
* This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
*/
entered_state = cpuidle_enter(drv, dev, next_state);
+ /* The cpu is no longer idle or about to enter idle. */
+ idle_set_state(this_rq(), NULL);
+
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
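
These two additions let the idle loop record which cpuidle state the CPU is about to enter and clear the record once it is no longer idle. A hedged sketch of a consumer, using the idle_get_state() accessor added to sched.h later in this diff; the function name and the pr_debug() are illustrative only:

        static void show_planned_idle_state(int cpu)
        {
                struct cpuidle_state *state;

                rcu_read_lock();                /* idle_get_state() warns without it */
                state = idle_get_state(cpu_rq(cpu));
                if (state)
                        pr_debug("cpu%d heading for %s (exit latency %uus)\n",
                                 cpu, state->name, state->exit_latency);
                rcu_read_unlock();
        }
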
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
return 0;
}
+static void update_curr_idle(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
};
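
The empty update_curr_idle() stub exists so that every scheduling class implements the new .update_curr hook added to struct sched_class later in this diff; core code can then charge the running task its elapsed runtime through an unconditional indirect call. A minimal sketch of such a call site, not a quote of the actual core change:

        /* Class-agnostic "update the current task's runtime accounting". */
        static void update_rq_curr(struct rq *rq)
        {
                rq->curr->sched_class->update_curr(rq);
        }
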
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (p->nr_cpus_allowed == 1)
- goto out;
-
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1351,16 +1348,22 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->nr_cpus_allowed == 1)
+ /*
+ * Current can't be migrated, so rescheduling is useless;
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
-
/*
* There appears to be other cpus that can accept
* current and none to run 'p', so lets reschedule
@@ -1448,7 +1451,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
* means a dl or stop task can slip in, in which case we need
* to re-start task selection.
*/
- if (unlikely((rq->stop && rq->stop->on_rq) ||
+ if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1468,8 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
- if (p)
- dequeue_pushable_task(rq, p);
+ dequeue_pushable_task(rq, p);
set_post_schedule(rq);
@@ -1526,7 +1528,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
- struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
@@ -1624,7 +1626,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task->on_rq)) {
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1658,7 +1660,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
return p;
@@ -1809,7 +1811,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* There's a chance that p is higher in priority
@@ -1870,7 +1872,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
BUG_ON(!rt_task(p));
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
weight = cpumask_weight(new_mask);
@@ -1936,7 +1938,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!p->on_rq || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
if (pull_rt_task(rq))
@@ -1970,7 +1972,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
@@ -1989,7 +1991,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
if (rq->curr == p) {
@@ -2073,7 +2075,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {
requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ resched_curr(rq);
return;
}
}
@@ -2129,6 +2131,8 @@ const struct sched_class rt_sched_class = {
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
#include "cpuacct.h"
struct rq;
+struct cpuidle_state;
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
extern __read_mostly int scheduler_running;
@@ -126,6 +131,9 @@ struct rt_bandwidth {
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
+
+void __dl_clear_params(struct task_struct *p);
+
/*
* To keep the bandwidth of -deadline tasks and groups under control
* we need some place where:
@@ -168,6 +176,25 @@ struct dl_bw {
u64 bw, total_bw;
};
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
extern struct mutex sched_domains_mutex;
#ifdef CONFIG_CGROUP_SCHED
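
The three inline helpers above form the root-domain deadline admission test: __dl_overflow() rejects a request that would push the reserved utilization past bw * cpus, while __dl_clear()/__dl_add() maintain the running total. A hedged sketch of how an admission path could combine them (the real check sits in the setscheduler path and also handles locking):

        /* Sketch: admit a task whose reservation changes from old_bw to new_bw
         * on a root domain spanning 'cpus' CPUs. */
        static bool dl_admit(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
        {
                if (__dl_overflow(dl_b, cpus, old_bw, new_bw))
                        return false;           /* would exceed the -deadline quota */
                __dl_clear(dl_b, old_bw);       /* drop the previous reservation */
                __dl_add(dl_b, new_bw);         /* account the new one */
                return true;
        }
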
@@ -184,7 +211,7 @@ struct cfs_bandwidth {
raw_spinlock_t lock;
ktime_t period;
u64 quota, runtime;
- s64 hierarchal_quota;
+ s64 hierarchical_quota;
u64 runtime_expires;
int idle, timer_active;
@@ -636,6 +663,11 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within an RCU lock section */
+ struct cpuidle_state *idle_state;
+#endif
};
static inline int cpu_of(struct rq *rq)
@@ -647,13 +679,13 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-DECLARE_PER_CPU(struct rq, runqueues);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() (&__get_cpu_var(runqueues))
+#define this_rq() this_cpu_ptr(&runqueues)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#define raw_rq() raw_cpu_ptr(&runqueues)
static inline u64 rq_clock(struct rq *rq)
{
@@ -665,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -942,6 +992,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#endif
}
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +1012,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -991,35 +1049,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
raw_spin_unlock_irq(&rq->lock);
}
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- */
- smp_wmb();
- prev->on_cpu = 0;
-#endif
- local_irq_enable();
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
/*
* wake flags
*/
@@ -1135,6 +1164,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serialized by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1143,6 +1177,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
+ void (*update_curr) (struct rq *rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
@@ -1180,6 +1216,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
#endif
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1486,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (!stop || !stop->on_rq)
+ if (!stop || !task_on_rq_queued(stop))
return NULL;
put_prev_task(rq, prev);
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
return 0;
}
+static void update_curr_stop(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/kthread.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
}
EXPORT_SYMBOL(autoremove_wake_function);
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply a write barrier and the users expect write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
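
wait_woken() and woken_wake_function() implement the protocol sketched in the comment above: WQ_FLAG_WOKEN, set by the waker and cleared by the waiter, is what closes the race between testing the condition and going to sleep. A hedged usage sketch of the waiter side; 'wq' and 'done' stand in for the caller's waitqueue and wakeup condition:

        static int wait_for_done(wait_queue_head_t *wq, bool *done, long timeout)
        {
                DEFINE_WAIT_FUNC(wait, woken_wake_function);

                add_wait_queue(wq, &wait);
                while (!*done) {
                        if (!timeout || signal_pending(current))
                                break;
                        timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
                }
                remove_wait_queue(wq, &wait);

                if (*done)
                        return 0;
                return signal_pending(current) ? -ERESTARTSYS : -ETIMEDOUT;
        }
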
@@ -343,6 +409,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit);
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ wait.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode)
@@ -520,3 +598,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
return 0;
}
EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
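
The *_timeout variants above extend the wait-on-bit machinery with a deadline carried in wait_bit_key.timeout (in jiffies). A hedged sketch of a caller waiting for bit 0 of a flags word to be cleared by some other context; the function name and the 5 second budget are illustrative:

        static int wait_for_flag_clear(unsigned long *flags)
        {
                int err = out_of_line_wait_on_bit_timeout(flags, 0, bit_wait_timeout,
                                                          TASK_UNINTERRUPTIBLE, 5 * HZ);

                return err ? -ETIMEDOUT : 0;    /* nonzero: the bit never cleared */
        }
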
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44eb005c6695..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
#include <linux/slab.h>
#include <linux/syscalls.h>
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
#ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
{
struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
- struct seccomp_data sd;
+ struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
/* Make sure cross-thread synced filter points somewhere sane. */
smp_read_barrier_depends();
- populate_seccomp_data(&sd);
+ if (!sd) {
+ populate_seccomp_data(&sd_local);
+ sd = &sd_local;
+ }
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
- u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
@@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
if (!filter)
goto free_prog;
- filter->prog = kzalloc(bpf_prog_size(new_len),
- GFP_KERNEL|__GFP_NOWARN);
+ filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
if (!filter->prog)
goto free_filter;
ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
if (ret)
goto free_filter_prog;
- kfree(fp);
+ kfree(fp);
atomic_set(&filter->usage, 1);
filter->prog->len = new_len;
@@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return filter;
free_filter_prog:
- kfree(filter->prog);
+ __bpf_prog_free(filter->prog);
free_filter:
kfree(filter);
free_prog:
@@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = {
};
#endif
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
+{
+ int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ syscall_whitelist = mode1_syscalls_32;
+#endif
+ do {
+ if (*syscall_whitelist == this_syscall)
+ return;
+ } while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+ dump_stack();
+#endif
+ audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+ int mode = current->seccomp.mode;
+
+ if (mode == 0)
+ return;
+ else if (mode == SECCOMP_MODE_STRICT)
+ __secure_computing_strict(this_syscall);
+ else
+ BUG();
+}
+#else
+int __secure_computing(void)
+{
+ u32 phase1_result = seccomp_phase1(NULL);
+
+ if (likely(phase1_result == SECCOMP_PHASE1_OK))
+ return 0;
+ else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+ return -1;
+ else
+ return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
{
- int exit_sig = 0;
- int *syscall;
- u32 ret;
+ u32 filter_ret, action;
+ int data;
/*
* Make sure that any changes to mode from another thread have
@@ -576,85 +623,127 @@ int __secure_computing(int this_syscall)
*/
rmb();
- switch (current->seccomp.mode) {
- case SECCOMP_MODE_STRICT:
- syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
- if (is_compat_task())
- syscall = mode1_syscalls_32;
+ filter_ret = seccomp_run_filters(sd);
+ data = filter_ret & SECCOMP_RET_DATA;
+ action = filter_ret & SECCOMP_RET_ACTION;
+
+ switch (action) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+
+ case SECCOMP_RET_TRACE:
+ return filter_ret; /* Save the rest for phase 2. */
+
+ case SECCOMP_RET_ALLOW:
+ return SECCOMP_PHASE1_OK;
+
+ case SECCOMP_RET_KILL:
+ default:
+ audit_seccomp(this_syscall, SIGSYS, action);
+ do_exit(SIGSYS);
+ }
+
+ unreachable();
+
+skip:
+ audit_seccomp(this_syscall, 0, action);
+ return SECCOMP_PHASE1_SKIP;
+}
#endif
- do {
- if (*syscall == this_syscall)
- return 0;
- } while (*++syscall);
- exit_sig = SIGKILL;
- ret = SECCOMP_RET_KILL;
- break;
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers. The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked. In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+ int mode = current->seccomp.mode;
+ int this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
+
+ switch (mode) {
+ case SECCOMP_MODE_STRICT:
+ __secure_computing_strict(this_syscall); /* may call do_exit */
+ return SECCOMP_PHASE1_OK;
#ifdef CONFIG_SECCOMP_FILTER
- case SECCOMP_MODE_FILTER: {
- int data;
- struct pt_regs *regs = task_pt_regs(current);
- ret = seccomp_run_filters(this_syscall);
- data = ret & SECCOMP_RET_DATA;
- ret &= SECCOMP_RET_ACTION;
- switch (ret) {
- case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
- syscall_set_return_value(current, regs,
- -data, 0);
- goto skip;
- case SECCOMP_RET_TRAP:
- /* Show the handler the original registers. */
- syscall_rollback(current, regs);
- /* Let the filter pass back 16 bits of data. */
- seccomp_send_sigsys(this_syscall, data);
- goto skip;
- case SECCOMP_RET_TRACE:
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- goto skip;
- }
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- break;
- if (syscall_get_nr(current, regs) < 0)
- goto skip; /* Explicit request to skip. */
-
- return 0;
- case SECCOMP_RET_ALLOW:
- return 0;
- case SECCOMP_RET_KILL:
- default:
- break;
- }
- exit_sig = SIGSYS;
- break;
- }
+ case SECCOMP_MODE_FILTER:
+ return __seccomp_phase1_filter(this_syscall, sd);
#endif
default:
BUG();
}
+}
-#ifdef SECCOMP_DEBUG
- dump_stack();
-#endif
- audit_seccomp(this_syscall, exit_sig, ret);
- do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
- audit_seccomp(this_syscall, exit_sig, ret);
-#endif
- return -1;
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ u32 action = phase1_result & SECCOMP_RET_ACTION;
+ int data = phase1_result & SECCOMP_RET_DATA;
+
+ BUG_ON(action != SECCOMP_RET_TRACE);
+
+ audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
+ return -1;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ do_exit(SIGSYS);
+ if (syscall_get_nr(current, regs) < 0)
+ return -1; /* Explicit request to skip. */
+
+ return 0;
}
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
long prctl_get_seccomp(void)
{
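
Taken together, seccomp_phase1() and seccomp_phase2() let architectures that select CONFIG_HAVE_ARCH_SECCOMP_FILTER run the cheap checks early and defer only the ptrace-dependent SECCOMP_RET_TRACE handling to a context where tracer hooks are safe. A hedged sketch of how a syscall-entry path might consume the pair; the surrounding hook is hypothetical and only the two calls come from this patch:

        static long arch_seccomp_fastpath(struct seccomp_data *sd)
        {
                u32 phase1 = seccomp_phase1(sd);

                if (phase1 == SECCOMP_PHASE1_OK)
                        return 0;               /* run the syscall */
                if (phase1 == SECCOMP_PHASE1_SKIP)
                        return -1;              /* skip it, retval already set */
                /* SECCOMP_RET_TRACE: finish where ptrace hooks are usable. */
                return seccomp_phase2(phase1);
        }
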
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..16a305295256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
local_irq_restore(*flags);
break;
}
-
+ /*
+ * This sighand can be already freed and even reused, but
+ * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * initializes ->siglock: this slab can't go away, it has
+ * the same object type, ->siglock can't be reinitialized.
+ *
+ * We need to ensure that tsk->sighand is still the same
+ * after we take the lock, we can race with de_thread() or
+ * __exit_signal(). In the latter case the next iteration
+ * must see ->sighand == NULL.
+ */
spin_lock(&sighand->siglock);
if (likely(sighand == tsk->sighand)) {
rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
int error = -ESRCH;
struct task_struct *p;
- rcu_read_lock();
-retry:
- p = pid_task(pid, PIDTYPE_PID);
- if (p) {
- error = group_send_sig_info(sig, info, p);
- if (unlikely(error == -ESRCH))
- /*
- * The task was unhashed in between, try again.
- * If it is dead, pid_task() will return NULL,
- * if we race with de_thread() it will find the
- * new leader.
- */
- goto retry;
- }
- rcu_read_unlock();
+ for (;;) {
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+ rcu_read_unlock();
+ if (likely(!p || error != -ESRCH))
+ return error;
- return error;
+ /*
+ * The task was unhashed in between, try again. If it
+ * is dead, pid_task() will return NULL, if we race with
+ * de_thread() it will find the new leader.
+ */
+ }
}
int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
+#ifdef SEGV_BNDERR
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smp.c b/kernel/smp.c
index aff8aa14f547..f38a1e692259 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <linux/sched.h>
#include "smpboot.h"
@@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
if (!csd) {
csd = &csd_stack;
if (!wait)
- csd = &__get_cpu_var(csd_data);
+ csd = this_cpu_ptr(&csd_data);
}
csd_lock(csd);
@@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
WARN_ON(!irqs_disabled());
- head = &__get_cpu_var(call_single_queue);
+ head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
@@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- cfd = &__get_cpu_var(cfd_data);
+ cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
+
+/**
+ * wake_up_all_idle_cpus - break all cpus out of idle
+ * wake_up_all_idle_cpus tries to break all cpus out of idle, including
+ * cpus that are polling in idle; cpus that are not idle are left alone.
+ */
+void wake_up_all_idle_cpus(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+
+ wake_up_if_idle(cpu);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
set_current_state(TASK_INTERRUPTIBLE);
preempt_disable();
if (kthread_should_stop()) {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->cleanup)
ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
/* Check for state change setup */
switch (td->status) {
case HP_THREAD_NONE:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->setup)
ht->setup(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
+
case HP_THREAD_PARKED:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->unpark)
ht->unpark(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
}
if (!ht->thread_should_run(td->cpu)) {
- preempt_enable();
+ preempt_enable_no_resched();
schedule();
} else {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -278,7 +278,7 @@ restart:
pending >>= softirq_bit;
}
- rcu_bh_qs(smp_processor_id());
+ rcu_bh_qs();
local_irq_disable();
pending = local_softirq_pending();
@@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a)
local_irq_disable();
list = __this_cpu_read(tasklet_vec.head);
__this_cpu_write(tasklet_vec.head, NULL);
- __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
local_irq_enable();
while (list) {
@@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a)
local_irq_disable();
list = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, NULL);
- __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
+ __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
local_irq_enable();
while (list) {
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
local_irq_enable();
cond_resched();
return;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
}
EXPORT_SYMBOL_GPL(print_stack_trace);
+int snprint_stack_trace(char *buf, size_t size,
+ struct stack_trace *trace, int spaces)
+{
+ int i;
+ unsigned long ip;
+ int generated;
+ int total = 0;
+
+ if (WARN_ON(!trace->entries))
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ ip = trace->entries[i];
+ generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
+ 1 + spaces, ' ', (void *) ip, (void *) ip);
+
+ total += generated;
+
+ /* Assume that generated isn't a negative number */
+ if (generated >= size) {
+ buf += size;
+ size = 0;
+ } else {
+ buf += generated;
+ size -= generated;
+ }
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(snprint_stack_trace);
+
/*
* Architectures that do not implement save_stack_trace_tsk or
* save_stack_trace_regs get this weak alias and a once-per-bootup warning
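
snprint_stack_trace() is the buffer-based counterpart of print_stack_trace(); like snprintf(), it keeps accumulating the would-be length once the buffer is exhausted. A hedged usage sketch (the function name, buffer size and skip depth are arbitrary):

        static void log_current_stack(void)
        {
                unsigned long entries[16];
                char buf[512];
                struct stack_trace trace = {
                        .entries     = entries,
                        .max_entries = ARRAY_SIZE(entries),
                        .skip        = 1,       /* drop this helper's own frame */
                };

                save_stack_trace(&trace);
                snprint_stack_trace(buf, sizeof(buf), &trace, 0);
                pr_info("%s", buf);
        }
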
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..a8c9f5a7dda6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
#include <asm/unistd.h>
#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a,b) (-EINVAL)
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a,b) (-EINVAL)
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a,b) (-EINVAL)
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a,b) (-EINVAL)
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b) (-EINVAL)
+# define GET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b) (-EINVAL)
+# define SET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a) (-EINVAL)
@@ -91,6 +91,12 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
+#ifndef MPX_ENABLE_MANAGEMENT
+# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef MPX_DISABLE_MANAGEMENT
+# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -182,39 +188,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p)
- error = set_one_prio(p, niceval, error);
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- error = set_one_prio(p, niceval, error);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
- error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* For find_user() */
- break;
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -244,47 +251,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
- goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- }
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* for find_user() */
- break;
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -306,7 +314,7 @@ out_unlock:
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
@@ -364,7 +372,7 @@ error:
}
/*
- * setgid() is implemented like SysV w/ SAVED_IDS
+ * setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
@@ -442,7 +450,7 @@ static int set_user(struct cred *new)
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
@@ -503,17 +511,17 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
- * setuid() is implemented like SysV with SAVED_IDS
- *
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
* Note that SAVED_ID's is deficient in that a setuid root program
- * like sendmail, for example, cannot set its uid to be a normal
+ * like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
- * regain them by swapping the real and effective uid.
+ * regain them by swapping the real and effective uid.
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
@@ -637,10 +645,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
euid = from_kuid_munged(cred->user_ns, cred->euid);
suid = from_kuid_munged(cred->user_ns, cred->suid);
- if (!(retval = put_user(ruid, ruidp)) &&
- !(retval = put_user(euid, euidp)))
- retval = put_user(suid, suidp);
-
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
return retval;
}
@@ -709,9 +719,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
egid = from_kgid_munged(cred->user_ns, cred->egid);
sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(rgid, rgidp)) &&
- !(retval = put_user(egid, egidp)))
- retval = put_user(sgid, sgidp);
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
return retval;
}
@@ -862,11 +875,9 @@ void do_sys_times(struct tms *tms)
{
cputime_t tgutime, tgstime, cutime, cstime;
- spin_lock_irq(&current->sighand->siglock);
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- spin_unlock_irq(&current->sighand->siglock);
tms->tms_utime = cputime_to_clock_t(tgutime);
tms->tms_stime = cputime_to_clock_t(tgstime);
tms->tms_cutime = cputime_to_clock_t(cutime);
@@ -1284,7 +1295,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
-
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
struct rlimit __user *, rlim)
{
@@ -1299,7 +1309,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
x.rlim_cur = 0x7FFFFFFF;
if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
- return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
#endif
@@ -1527,7 +1537,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
cputime_t tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
- memset((char *) r, 0, sizeof *r);
+ memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
@@ -1541,41 +1551,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
return;
switch (who) {
- case RUSAGE_BOTH:
- case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
-
- if (who == RUSAGE_CHILDREN)
- break;
-
- case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
- accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
break;
- default:
- BUG();
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
}
unlock_task_sighand(p, &flags);
@@ -1585,6 +1595,7 @@ out:
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
+
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
@@ -1596,6 +1607,7 @@ out:
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
+
k_getrusage(p, who, &r);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
@@ -1628,12 +1640,14 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
struct inode *inode;
int err;
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
exe = fdget(fd);
if (!exe.file)
return -EBADF;
@@ -1654,8 +1668,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- down_write(&mm->mmap_sem);
-
/*
* Forbid mm->exe_file change if old file still mapped.
*/
@@ -1667,7 +1679,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (vma->vm_file &&
path_equal(&vma->vm_file->f_path,
&mm->exe_file->f_path))
- goto exit_unlock;
+ goto exit;
}
/*
@@ -1678,34 +1690,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit_unlock;
+ goto exit;
err = 0;
set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
-exit_unlock:
- up_write(&mm->mmap_sem);
-
exit:
fdput(exe);
return err;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here so be very careful
+ * in what is allowed for modification from userspace.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members do not point outside of the
+ * allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Nor should we allow the limits to be overridden if they are set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the rights to
+ * change /proc/pid/exe link: only local root should
+ * be allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* Last entry must be AT_NULL as specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ down_write(&mm->mmap_sem);
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
+ downgrade_write(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate that these members point to real, currently
+ * mapped VMAs: the application may already have unmapped the
+ * corresponding VMAs, and the kernel mostly uses these members for
+ * statistics output in procfs, except
+ *
+ * - @start_brk/@brk, which are used in do_brk; the kernel looks up
+ * VMAs when updating these members, so a bogus value written here
+ * makes the kernel complain at the userspace program but won't lead
+ * to any problem in the kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless, so if someone
+ * reads this member in procfs while we're updating it, they may
+ * get partially updated results. This is a known and acceptable
+ * trade off: we leave it as is rather than introduce additional
+ * locking here and make the kernel more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- unsigned long rlim = rlimit(RLIMIT_DATA);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int error;
- if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
return -EINVAL;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_EXE_FILE) {
+ down_write(&mm->mmap_sem);
+ error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
+ up_write(&mm->mmap_sem);
+ return error;
+ }
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
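
For checkpoint/restore, PR_SET_MM_MAP replaces a series of individual PR_SET_MM_* calls with a single update of all mm boundaries, and PR_SET_MM_MAP_SIZE lets userspace probe the expected structure size first. A hedged userspace sketch; every address member must be filled in with real values, since validate_prctl_map() rejects anything outside the allowed range:

        /* Userspace sketch, not kernel code. */
        static int install_mm_map(const struct prctl_mm_map *map)
        {
                unsigned int size;

                if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &size, 0, 0))
                        return -1;
                if (size != sizeof(*map))
                        return -1;      /* built against a different ABI */
                /* map->exe_fd == (u32)-1 means: leave /proc/pid/exe alone. */
                return prctl(PR_SET_MM, PR_SET_MM_MAP, map, sizeof(*map), 0);
        }
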
@@ -1733,9 +1933,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (mm->brk - addr) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
goto out;
mm->start_brk = addr;
@@ -1745,9 +1944,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (addr - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
mm->brk = addr;
@@ -2011,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
me->mm->def_flags &= ~VM_NOHUGEPAGE;
up_write(&me->mm->mmap_sem);
break;
+ case PR_MPX_ENABLE_MANAGEMENT:
+ error = MPX_ENABLE_MANAGEMENT(me);
+ break;
+ case PR_MPX_DISABLE_MANAGEMENT:
+ error = MPX_DISABLE_MANAGEMENT(me);
+ break;
default:
error = -EINVAL;
break;
@@ -2023,6 +2227,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
{
int err = 0;
int cpu = raw_smp_processor_id();
+
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
@@ -2135,7 +2340,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
/* Check to see if any memory value is too large for 32-bit and scale
* down if needed
*/
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
int bitcount = 0;
while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
cond_syscall(sys_uselib);
+cond_syscall(sys_fadvise64);
+cond_syscall(sys_fadvise64_64);
+cond_syscall(sys_madvise);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
@@ -166,6 +169,8 @@ cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
@@ -218,3 +223,9 @@ cond_syscall(sys_kcmp);
/* operate on Secure Computing state */
cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75875a741b5e..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "numa_balancing",
@@ -622,6 +623,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#ifdef CONFIG_KEXEC
{
@@ -1055,15 +1063,6 @@ static struct ctl_table kern_table[] = {
.child = key_sysctls,
},
#endif
-#ifdef CONFIG_RCU_TORTURE_TEST
- {
- .procname = "rcutorture_runnable",
- .data = &rcutorture_runnable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
/*
* User-space scripts rely on the existence of this file
@@ -1112,6 +1111,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
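The new kernel.panic_on_warn entry is range-checked (extra1/extra2 clamp it to 0..1). A minimal user-space sketch of flipping it through procfs, equivalent to `sysctl -w kernel.panic_on_warn=1`:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/panic_on_warn", "w");

	if (!f)
		return 1;
	fputs("1\n", f);	/* proc_dointvec_minmax rejects anything outside 0..1 */
	return fclose(f) ? 1 : 0;
}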
@@ -1460,13 +1468,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
- {
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
- .mode = 0644,
- .proc_handler = scan_unevictable_handler,
- },
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e4ba9a5a5ccb..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
{}
};
@@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
- { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
{ CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
{ CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
{ CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7cd65db..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, f.file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
@@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
fill_tgid_exit(tsk);
}
- listeners = __this_cpu_ptr(&listener_array);
+ listeners = raw_cpu_ptr(&listener_array);
if (list_empty(&listeners->list))
return;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
-obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
+obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
$(obj)/time.o: $(obj)/timeconst.h
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
* Also omit the add if it would overflow the u64 boundary.
*/
if ((~0ULL - clc > rnd) &&
- (!ismax || evt->mult <= (1U << evt->shift)))
+ (!ismax || evt->mult <= (1ULL << evt->shift)))
clc += rnd;
do_div(clc, evt->mult);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
/* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
- /* Add clocksource to the clcoksource list */
+ /* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..37e50aadd471 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
static int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
int res;
@@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
*/
static void retrigger_next_event(void *arg)
{
- struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (!hrtimer_hres_active())
return;
@@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
*/
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
- reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
+ reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
/*
* We must preserve the CALLBACK state flag here,
* otherwise we could move the timer base in
@@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* on dynticks target.
*/
wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
+ } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
hrtimer_reprogram(timer, new_base)) {
/*
* Only allow reprogramming if the new base is on this CPU.
@@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
*/
ktime_t hrtimer_get_next_event(void)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
unsigned long flags;
@@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
memset(timer, 0, sizeof(struct hrtimer));
- cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
clock_id = CLOCK_MONOTONIC;
@@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
struct hrtimer_cpu_base *cpu_base;
int base = hrtimer_clockid_to_base(which_clock);
- cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
return 0;
@@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
*/
void hrtimer_interrupt(struct clock_event_device *dev)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;
@@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void)
if (!hrtimer_hres_active())
return;
- td = &__get_cpu_var(tick_cpu_device);
+ td = this_cpu_ptr(&tick_cpu_device);
if (td && td->evtdev)
hrtimer_interrupt(td->evtdev);
}
@@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void)
void hrtimer_run_queues(void)
{
struct timerqueue_node *node;
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base;
int index, gettime = 1;
@@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu)
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = &__get_cpu_var(hrtimer_bases);
+ new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
*/
if (!expires) {
schedule();
- __set_current_state(TASK_RUNNING);
return -EINTR;
}
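Every hrtimer hunk above is the same mechanical conversion: the deprecated __get_cpu_var()/__raw_get_cpu_var() accessors become this_cpu_ptr()/raw_cpu_ptr(). A minimal sketch of the equivalence on an illustrative per-CPU variable:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);	/* illustrative name */

static void example(void)
{
	/* old style: int *p = &__get_cpu_var(example_counter); */
	int *p = this_cpu_ptr(&example_counter);  /* resolves for the current CPU */
	int *q = raw_cpu_ptr(&example_counter);   /* same, but skips debug checks */

	(*p)++;
	(void)q;
}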
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
if (same_thread_group(tsk, current))
err = cpu_clock_sample(which_clock, tsk, &rtn);
} else {
- unsigned long flags;
- struct sighand_struct *sighand;
-
- /*
- * while_each_thread() is not yet entirely RCU safe,
- * keep locking the group while sampling process
- * clock for now.
- */
- sighand = lock_task_sighand(tsk, &flags);
- if (!sighand)
- return err;
-
if (tsk == current || thread_group_leader(tsk))
err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
- unlock_task_sighand(tsk, &flags);
}
if (!err)
@@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
*sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- *sample = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
goto out;
}
} else {
+ memset(&event.sigev_value, 0, sizeof(event.sigev_value));
event.sigev_notify = SIGEV_SIGNAL;
event.sigev_signo = SIGALRM;
event.sigev_value.sival_int = new_timer->it_id;
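The added memset() is an information-leak fix: sigev_value is a union, only sival_int is filled in here, and the value is later delivered to user space with the signal, so any unwritten union bytes would expose kernel stack contents. A generic sketch of the pattern with illustrative types:

#include <string.h>

union example_value {
	int	ival;
	void	*ptr;			/* wider than ival on 64-bit */
};

static void fill_for_user(union example_value *v, int id)
{
	memset(v, 0, sizeof(*v));	/* clear the bytes ival will not cover */
	v->ival = id;			/* then initialize only the member used */
}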
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 64c5990fd500..066f0ec05e48 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
void tick_check_oneshot_broadcast_this_cpu(void)
{
if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
/*
* We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..7efeedf53ebd 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td,
void tick_install_replacement(struct clock_event_device *newdev)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
int cpu = smp_processor_id();
clockevents_exchange_device(td->evtdev, newdev);
@@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup)
void tick_suspend(void)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
clockevents_shutdown(td->evtdev);
}
void tick_resume(void)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
int broadcast = tick_resume_broadcast();
clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
@@ -400,4 +400,5 @@ void tick_resume(void)
void __init tick_init(void)
{
tick_broadcast_init();
+ tick_nohz_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
+/* NO_HZ_FULL internal */
+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_init(void);
+# else
+static inline void tick_nohz_init(void) { }
+#endif
+
/*
* Broadcasting support
*/
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a33..7ce740e78e1b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f654a8a298fa..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
*/
void __tick_nohz_full_check(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (tick_nohz_full_cpu(smp_processor_id())) {
if (ts->tick_stopped && !is_idle_task(current)) {
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}
/*
@@ -295,22 +295,12 @@ out:
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
- int cpu;
-
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
- alloc_bootmem_cpumask_var(&housekeeping_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
-
- cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
- }
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, tick_nohz_full_mask);
tick_nohz_full_running = true;
return 1;
@@ -349,18 +339,11 @@ static int tick_nohz_init_all(void)
#ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
- return err;
- }
- if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
cpumask_setall(tick_nohz_full_mask);
- cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
- cpumask_clear(housekeeping_mask);
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
tick_nohz_full_running = true;
#endif
return err;
@@ -375,6 +358,37 @@ void __init tick_nohz_init(void)
return;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ cpumask_clear(tick_nohz_full_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ /*
+ * Full dynticks uses irq work to drive the tick rescheduling on safe
+ * locking contexts. But then we need irq work to raise its own
+ * interrupts to avoid circular dependency on the tick
+ */
+ if (!arch_irq_work_has_interrupt()) {
+ pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
+ "support irq work self-IPIs\n");
+ cpumask_clear(tick_nohz_full_mask);
+ cpumask_copy(housekeeping_mask, cpu_possible_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
+
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@@ -559,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
ktime_t last_update, expires, ret = { .tv64 = 0 };
unsigned long rcu_delta_jiffies;
- struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 time_delta;
time_delta = timekeeping_max_deferment();
@@ -571,8 +585,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
last_jiffies = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
- arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
+ if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ arch_needs_cpu() || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
@@ -827,13 +841,12 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
- ts = &__get_cpu_var(tick_cpu_sched);
+ ts = this_cpu_ptr(&tick_cpu_sched);
ts->inidle = 1;
__tick_nohz_idle_enter(ts);
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -845,7 +858,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
*/
void tick_nohz_irq_exit(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
__tick_nohz_idle_enter(ts);
@@ -860,7 +873,7 @@ void tick_nohz_irq_exit(void)
*/
ktime_t tick_nohz_get_sleep_length(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->sleep_length;
}
@@ -938,7 +951,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
*/
void tick_nohz_idle_exit(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
local_irq_disable();
@@ -960,7 +973,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
@@ -973,7 +985,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
*/
static void tick_nohz_handler(struct clock_event_device *dev)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
@@ -982,6 +994,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -993,7 +1009,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
*/
static void tick_nohz_switch_to_nohz(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t next;
if (!tick_nohz_enabled)
@@ -1055,7 +1071,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
static inline void tick_nohz_irq_enter(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
if (!ts->idle_active && !ts->tick_stopped)
@@ -1109,6 +1125,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
if (regs)
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
@@ -1129,7 +1149,7 @@ early_param("skew_tick", skew_tick);
*/
void tick_setup_sched_timer(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
/*
@@ -1198,7 +1218,7 @@ void tick_clock_notify(void)
*/
void tick_oneshot_notify(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
set_bit(0, &ts->check_clocks);
}
@@ -1213,7 +1233,7 @@ void tick_oneshot_notify(void)
*/
int tick_check_oneshot_change(int allow_nohz)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (!test_and_clear_bit(0, &ts->check_clocks))
return 0;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..6390517e77d4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+/*
+ * mktime64 - Converts date to seconds.
+ * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
*
@@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc);
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines where long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
*/
-unsigned long
-mktime(const unsigned int year0, const unsigned int mon0,
- const unsigned int day, const unsigned int hour,
- const unsigned int min, const unsigned int sec)
+time64_t mktime64(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
{
unsigned int mon = mon0, year = year0;
@@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
year -= 1;
}
- return ((((unsigned long)
+ return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
)*24 + hour /* now have hours */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
-
-EXPORT_SYMBOL(mktime);
+EXPORT_SYMBOL(mktime64);
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
@@ -745,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n)
return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
+EXPORT_SYMBOL(nsecs_to_jiffies64);
/**
* nsecs_to_jiffies - Convert nsecs in u64 to jiffies
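mktime64() is the y2038/2106-safe replacement for mktime(): the same Gauss-style arithmetic, but the result is time64_t instead of unsigned long. A small usage sketch (the header providing the declaration is assumed to be linux/time.h):

#include <linux/time.h>

static time64_t example_date_to_seconds(void)
{
	/* 2040-01-02 03:04:05 would overflow a 32-bit unsigned long,
	 * but fits comfortably in time64_t. */
	return mktime64(2040, 1, 2, 3, 4, 5);
}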
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..6a931852082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
- s64 nsec;
+ u64 seconds;
+ u32 nsec;
/*
* The xtime based monotonic readout is:
@@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
* nsec = base_mono + now();
* ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
*/
- nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
- nsec *= NSEC_PER_SEC;
- nsec += tk->wall_to_monotonic.tv_nsec;
- tk->tkr.base_mono = ns_to_ktime(nsec);
+ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
+ nsec = (u32) tk->wall_to_monotonic.tv_nsec;
+ tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/* Update the monotonic raw base */
tk->base_raw = timespec64_to_ktime(tk->raw_time);
+
+ /*
+ * The sum of the nanoseconds portions of xtime and
+ * wall_to_monotonic can be greater/equal one second. Take
+ * this into account before updating tk->ktime_sec.
+ */
+ nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ if (nsec >= NSEC_PER_SEC)
+ seconds++;
+ tk->ktime_sec = seconds;
}
/* must hold timekeeper_lock */
@@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
/**
* getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts: pointer to the timespec to be set
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the time of day in a timespec (WARN if suspended).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
void getnstimeofday64(struct timespec64 *ts)
{
@@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
+ * in normalized timespec64 format in the variable pointed to by @ts.
*/
void ktime_get_ts64(struct timespec64 *ts)
{
@@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);
+/**
+ * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
+ *
+ * Returns the seconds portion of CLOCK_MONOTONIC with a single non
+ * serialized read. tk->ktime_sec is of type 'unsigned long' so this
+ * works on both 32 and 64 bit systems. On 32 bit systems the readout
+ * covers ~136 years of uptime which should be enough to prevent
+ * premature wrap arounds.
+ */
+time64_t ktime_get_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ WARN_ON(timekeeping_suspended);
+ return tk->ktime_sec;
+}
+EXPORT_SYMBOL_GPL(ktime_get_seconds);
+
+/**
+ * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
+ *
+ * Returns the wall clock seconds since 1970. This replaces the
+ * get_seconds() interface which is not y2038 safe on 32bit systems.
+ *
+ * For 64bit systems the fast access to tk->xtime_sec is preserved. On
+ * 32bit systems the access must be protected with the sequence
+ * counter to provide "atomic" access to the 64bit tk->xtime_sec
+ * value.
+ */
+time64_t ktime_get_real_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ time64_t seconds;
+ unsigned int seq;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ return tk->xtime_sec;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ seconds = tk->xtime_sec;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return seconds;
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
+
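On 32-bit, ktime_get_real_seconds() cannot read the 64-bit xtime_sec in one access, so it relies on the timekeeper's seqcount: re-read if a writer raced with the reader. A generic sketch of that read-side pattern with illustrative names:

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t example_seq;
static u64 example_value;

static u64 example_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&example_seq);
		v = example_value;	/* may be torn on 32-bit ... */
	} while (read_seqcount_retry(&example_seq, seq));	/* ... so retry */

	return v;
}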
#ifdef CONFIG_NTP_PPS
/**
@@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv)
EXPORT_SYMBOL(do_gettimeofday);
/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
+ * do_settimeofday64 - Sets the time of day.
+ * @ts: pointer to the timespec64 variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(const struct timespec *tv)
+int do_settimeofday64(const struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts_delta, xt, tmp;
+ struct timespec64 ts_delta, xt;
unsigned long flags;
- if (!timespec_valid_strict(tv))
+ if (!timespec64_valid_strict(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
+ ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
- tmp = timespec_to_timespec64(*tv);
- tk_set_xtime(tk, &tmp);
+ tk_set_xtime(tk, ts);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv)
return 0;
}
-EXPORT_SYMBOL(do_settimeofday);
+EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock)
}
/**
- * getrawmonotonic - Returns the raw monotonic time in a timespec
- * @ts: pointer to the timespec to be set
+ * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic(struct timespec *ts)
+void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts64;
@@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts)
} while (read_seqcount_retry(&tk_core.seq, seq));
timespec64_add_ns(&ts64, nsecs);
- *ts = timespec64_to_timespec(ts64);
+ *ts = ts64;
}
-EXPORT_SYMBOL(getrawmonotonic);
+EXPORT_SYMBOL(getrawmonotonic64);
+
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
/**
- * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
- * @delta: pointer to a timespec delta value
+ * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
+ * @delta: pointer to a timespec64 delta value
*
* This hook is for architectures that cannot support read_persistent_clock
* because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
*/
-void timekeeping_inject_sleeptime(struct timespec *delta)
+void timekeeping_inject_sleeptime64(struct timespec64 *delta)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 tmp;
unsigned long flags;
/*
@@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
timekeeping_forward_now(tk);
- tmp = timespec_to_timespec64(*delta);
- __timekeeping_inject_sleeptime(tk, &tmp);
+ __timekeeping_inject_sleeptime(tk, delta);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
+ if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+ /* NTP adjustment caused clocksource mult overflow */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
tk->tkr.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr.xtime_nsec -= offset;
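The added guard refuses an NTP adjustment that would wrap the 32-bit multiplier: for a positive mult_adj, unsigned wraparound is exactly the case where mult + mult_adj ends up smaller than mult_adj. A worked example of the test with illustrative values:

/* mult     = 0xF0000000
 * mult_adj = 0x20000000
 * mult + mult_adj = 0x10000000 (mod 2^32), which is < mult_adj,
 * so the adjustment is rejected instead of silently corrupting mult. */
static bool mult_would_wrap(u32 mult, u32 mult_adj)
{
	return mult + mult_adj < mult_adj;
}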
@@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
if (unlikely(tk->tkr.clock->maxadj &&
- (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ (abs(tk->tkr.mult - tk->tkr.clock->mult)
+ > tk->tkr.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
tk->tkr.clock->name, (long)tk->tkr.mult,
@@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void)
}
EXPORT_SYMBOL(current_kernel_time);
-struct timespec get_monotonic_coarse(void)
+struct timespec64 get_monotonic_coarse64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
@@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void)
set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
- return timespec64_to_timespec(now);
+ return now;
}
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+ struct tvec_base *base = raw_cpu_read(tvec_bases);
timer->entry.next = NULL;
timer->base = (void *)((unsigned long)base | flags);
@@ -1377,15 +1377,14 @@ unsigned long get_next_timer_interrupt(unsigned long now)
void update_process_times(int user_tick)
{
struct task_struct *p = current;
- int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(cpu, user_tick);
+ rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
- irq_work_run();
+ irq_work_tick();
#endif
scheduler_tick();
run_posix_cpu_timers(p);
diff --git a/kernel/torture.c b/kernel/torture.c
index d600af21f022..dd70993c266c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
/*
* Print online/offline testing statistics.
*/
-char *torture_onoff_stats(char *page)
+void torture_onoff_stats(void)
{
#ifdef CONFIG_HOTPLUG_CPU
- page += sprintf(page,
- "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
- n_online_successes, n_online_attempts,
- n_offline_successes, n_offline_attempts,
- min_online, max_online,
- min_offline, max_offline,
- sum_online, sum_offline, HZ);
+ pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
- return page;
}
EXPORT_SYMBOL_GPL(torture_onoff_stats);
@@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
*
* This must be called before the caller starts shutting down its own
* kthreads.
+ *
+ * Both torture_cleanup_begin() and torture_cleanup_end() must be paired,
+ * in order to correctly perform the cleanup. They are separated because
+ * kthreads may still need to reference torture_type, so it is nullified
+ * only after all other relevant calls have completed.
*/
-bool torture_cleanup(void)
+bool torture_cleanup_begin(void)
{
mutex_lock(&fullstop_mutex);
if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
@@ -651,12 +654,17 @@ bool torture_cleanup(void)
torture_shuffle_cleanup();
torture_stutter_cleanup();
torture_onoff_cleanup();
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup_begin);
+
+void torture_cleanup_end(void)
+{
mutex_lock(&fullstop_mutex);
torture_type = NULL;
mutex_unlock(&fullstop_mutex);
- return false;
}
-EXPORT_SYMBOL_GPL(torture_cleanup);
+EXPORT_SYMBOL_GPL(torture_cleanup_end);
/*
* Is it time for the current torture test to stop?
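Splitting the old torture_cleanup() lets a test stop its own kthreads, which may still dereference torture_type, between the two halves. A hedged sketch of how a caller is expected to pair them (the test-specific helper is illustrative):

static void example_torture_cleanup(void)
{
	if (torture_cleanup_begin())
		return;			/* shutdown already in progress */

	example_stop_my_kthreads();	/* hypothetical test-specific teardown */

	torture_cleanup_end();		/* torture_type is cleared only here */
}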
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
-ifeq ($(CONFIG_PM_RUNTIME),y)
+ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
endif
ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
-static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq,
- "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
- MAJOR(t->device), MINOR(t->device), iter->cpu,
- secs, nsec_rem, iter->ent->pid, act, rwbs);
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static int blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
}
-static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
{
const unsigned char *pdu_buf;
int pdu_len;
- int i, end, ret;
+ int i, end;
pdu_buf = pdu_start(ent);
pdu_len = te_blk_io_trace(ent)->pdu_len;
if (!pdu_len)
- return 1;
+ return;
/* find the last zero that needs to be printed */
for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
break;
end++;
- if (!trace_seq_putc(s, '('))
- return 0;
+ trace_seq_putc(s, '(');
for (i = 0; i < pdu_len; i++) {
- ret = trace_seq_printf(s, "%s%02x",
- i == 0 ? "" : " ", pdu_buf[i]);
- if (!ret)
- return ret;
+ trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
/*
* stop when the rest is just zeroes and indicate so
* with a ".." appended
*/
- if (i == end && end != pdu_len - 1)
- return trace_seq_puts(s, " ..) ");
+ if (i == end && end != pdu_len - 1) {
+ trace_seq_puts(s, " ..) ");
+ return;
+ }
}
- return trace_seq_puts(s, ") ");
+ trace_seq_puts(s, ") ");
}
-static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = trace_seq_printf(s, "%u ", t_bytes(ent));
- if (!ret)
- return 0;
- ret = blk_log_dump_pdu(s, ent);
- if (!ret)
- return 0;
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "%u ", t_bytes(ent));
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%s]\n",
+ trace_seq_printf(s, "%llu + %u [%s]\n",
t_sector(ent), t_sec(ent), cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ else
+ trace_seq_printf(s, "[%s]\n", cmd);
}
}
-static int blk_log_with_error(struct trace_seq *s,
+static void blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = blk_log_dump_pdu(s, ent);
- if (ret)
- return trace_seq_printf(s, "[%d]\n", t_error(ent));
- return 0;
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%d]\n",
- t_sector(ent),
- t_sec(ent), t_error(ent));
- return trace_seq_printf(s, "%llu [%d]\n",
- t_sector(ent), t_error(ent));
+ trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ else
+ trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
}
}
-static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
{
struct blk_io_trace_remap r = { .device_from = 0, };
get_pdu_remap(ent, &r);
- return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
- t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
}
-static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "[%s]\n", cmd);
}
-static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
}
-static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
}
-static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
{
- int ret;
const struct blk_io_trace *t = te_blk_io_trace(ent);
- ret = trace_seq_putmem(s, t + 1, t->pdu_len);
- if (ret)
- return trace_seq_putc(s, '\n');
- return ret;
+ trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putc(s, '\n');
}
/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- int (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
- int ret;
bool long_act;
blk_log_action_t *log_action;
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
- ret = log_action(iter, long_act ? "message" : "m");
- if (ret)
- ret = blk_log_msg(s, iter->ent);
- goto out;
+ log_action(iter, long_act ? "message" : "m");
+ blk_log_msg(s, iter->ent);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
- ret = trace_seq_printf(s, "Unknown action %x\n", what);
+ trace_seq_printf(s, "Unknown action %x\n", what);
else {
- ret = log_action(iter, what2act[what].act[long_act]);
- if (ret)
- ret = what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act]);
+ what2act[what].print(s, iter->ent);
}
-out:
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
return print_one_line(iter, false);
}
-static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
.time = iter->ts,
};
- if (!trace_seq_putmem(s, &old, offset))
- return 0;
- return trace_seq_putmem(s, &t->sector,
- sizeof(old) - offset + t->pdu_len);
+ trace_seq_putmem(s, &old, offset);
+ trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
}
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- return blk_trace_synthesize_old_trace(iter) ?
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ blk_trace_synthesize_old_trace(iter);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
- spin_lock_irq(&running_trace_lock);
- list_del(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
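The blktrace rework follows a wider trace_seq API change in this series: trace_seq_printf() and friends now return void, and the caller checks the sequence's overflow state once at the end. A hedged sketch of the new calling pattern (trace_handle_return() is taken to map the overflow state to HANDLED/PARTIAL_LINE, as its use above implies):

static enum print_line_t example_print(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	/* No per-call return-value checks any more: just emit everything ... */
	trace_seq_printf(s, "cpu=%d ", iter->cpu);
	trace_seq_puts(s, "example\n");

	/* ... then report HANDLED or PARTIAL_LINE from the overflow state. */
	return trace_handle_return(s);
}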
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5916a8e59e87..929a733d302e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
static struct ftrace_ops control_ops;
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs);
@@ -251,18 +254,24 @@ static void update_ftrace_function(void)
ftrace_func_t func;
/*
+ * Prepare the ftrace_ops that the arch callback will use.
+ * If there's only one ftrace_ops registered, the ftrace_ops_list
+ * will point to the ops we want.
+ */
+ set_function_trace_op = ftrace_ops_list;
+
+ /* If there's no ftrace_ops registered, just call the stub function */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ func = ftrace_stub;
+
+ /*
* If we are at the end of the list and this ops is
* recursion safe and not dynamic and the arch supports passing ops,
* then have the mcount trampoline call the function directly.
*/
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list->next == &ftrace_list_end &&
- !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
- (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
- !FTRACE_FORCE_LIST_FUNC)) {
- /* Set the ftrace_ops that the arch callback uses */
- set_function_trace_op = ftrace_ops_list;
- func = ftrace_ops_list->func;
+ } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ func = ftrace_ops_get_func(ftrace_ops_list);
+
} else {
/* Just use the default ftrace_ops */
set_function_trace_op = &ftrace_list_end;
@@ -378,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
return ret;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
static int __register_ftrace_function(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -407,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
+ /* The control_ops needs the trampoline update */
+ ops = &control_ops;
} else
add_ftrace_ops(&ftrace_ops_list, ops);
+ ftrace_update_trampoline(ops);
+
if (ftrace_enabled)
update_ftrace_function();
@@ -556,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " Function "
- "Hit Time Avg s^2\n"
- " -------- "
- "--- ---- --- ---\n");
+ seq_puts(m, " Function "
+ "Hit Time Avg s^2\n"
+ " -------- "
+ "--- ---- --- ---\n");
#else
- seq_printf(m, " Function Hit\n"
- " -------- ---\n");
+ seq_puts(m, " Function Hit\n"
+ " -------- ---\n");
#endif
return 0;
}
@@ -589,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " ");
+ seq_puts(m, " ");
avg = rec->time;
do_div(avg, rec->counter);
@@ -1048,6 +1063,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
static struct ftrace_ops *removed_ops;
+/*
+ * Set when doing a global update, like enabling all recs or disabling them.
+ * It is not set when just updating a single ftrace_ops.
+ */
+static bool update_all_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1096,6 +1117,43 @@ static struct ftrace_ops global_ops = {
FTRACE_OPS_FL_INITIALIZED,
};
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ struct ftrace_ops *op;
+ bool ret = false;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they are freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This is to check for dynamically allocated trampolines.
+ * Trampolines that are in kernel text will have
+ * core_kernel_text() return true.
+ */
+ if (op->trampoline && op->trampoline_size)
+ if (addr >= op->trampoline &&
+ addr < op->trampoline + op->trampoline_size) {
+ ret = true;
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -1300,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash);
+
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1307,12 +1368,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_func_entry *entry;
struct hlist_node *tn;
struct hlist_head *hhd;
- struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
+ int ret;
int i;
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
/*
* If the new source is empty, just free dst and assign it
* the empty_hash.
@@ -1346,21 +1411,44 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
update:
+ /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
+ if (enable) {
+ /* IPMODIFY should be updated only when filter_hash updating */
+ ret = ftrace_hash_ipmodify_update(ops, new_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return ret;
+ }
+ }
+
/*
* Remove the current set, update the hash and add
* them back.
*/
ftrace_hash_rec_disable_modify(ops, enable);
- old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
- free_ftrace_hash_rcu(old_hash);
ftrace_hash_rec_enable_modify(ops, enable);
return 0;
}
+static bool hash_contains_ip(unsigned long ip,
+ struct ftrace_ops_hash *hash)
+{
+ /*
+ * The function record is a match if it exists in the filter
+ * hash and not in the notrace hash. Note, an empty hash is
+ * considered a match for the filter hash, but an empty
+ * notrace hash is considered not in the notrace hash.
+ */
+ return (ftrace_hash_empty(hash->filter_hash) ||
+ ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ (ftrace_hash_empty(hash->notrace_hash) ||
+ !ftrace_lookup_ip(hash->notrace_hash, ip));
+}
+
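hash_contains_ip() centralizes the filter/notrace decision several call sites used to open-code. The asymmetry noted in its comment is the important part: an empty filter hash means "trace everything", while an empty notrace hash means "exclude nothing". Spelled out for an ip that is present in whichever hash is non-empty:

/*
 *   filter_hash   notrace_hash   hash_contains_ip()
 *   -----------   ------------   ------------------
 *   empty         empty          true   (trace all, exclude none)
 *   contains ip   empty          true
 *   empty         contains ip    false  (explicitly excluded)
 *   contains ip   contains ip    false  (notrace wins)
 */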
/*
* Test the hashes for this ops to see if we want to call
* the ops->func or not.
@@ -1376,8 +1464,7 @@ update:
static int
ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_ops_hash hash;
int ret;
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1390,13 +1477,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
- notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+ hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
- if ((ftrace_hash_empty(filter_hash) ||
- ftrace_lookup_ip(filter_hash, ip)) &&
- (ftrace_hash_empty(notrace_hash) ||
- !ftrace_lookup_ip(notrace_hash, ip)))
+ if (hash_contains_ip(ip, &hash))
ret = 1;
else
ret = 0;
@@ -1508,46 +1592,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
return keep_regs;
}
-static void ftrace_remove_tramp(struct ftrace_ops *ops,
- struct dyn_ftrace *rec)
-{
- /* If TRAMP is not set, no ops should have a trampoline for this */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- return;
-
- rec->flags &= ~FTRACE_FL_TRAMP;
-
- if ((!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) ||
- ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
- return;
- /*
- * The tramp_hash entry will be removed at time
- * of update.
- */
- ops->nr_trampolines--;
-}
-
-static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops)
-{
- struct ftrace_ops *op;
-
- /* If TRAMP is not set, no ops should have a trampoline for this */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- return;
-
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- /*
- * This function is called to clear other tramps
- * not the one that is being updated.
- */
- if (op == ops)
- continue;
- if (op->nr_trampolines)
- ftrace_remove_tramp(op, rec);
- } while_for_each_ftrace_op(op);
-}
-
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1636,18 +1680,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* function, and the ops has a trampoline registered
* for it, then we can call it directly.
*/
- if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline)
rec->flags |= FTRACE_FL_TRAMP;
- ops->nr_trampolines++;
- } else {
+ else
/*
* If we are adding another function callback
* to this function, and the previous had a
* custom trampoline in use, then we need to go
* back to the default trampoline.
*/
- ftrace_clear_tramps(rec, ops);
- }
+ rec->flags &= ~FTRACE_FL_TRAMP;
/*
* If any ops wants regs saved for this function
@@ -1660,9 +1702,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
return;
rec->flags--;
- if (ops->trampoline && !ftrace_rec_count(rec))
- ftrace_remove_tramp(ops, rec);
-
/*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
@@ -1677,6 +1716,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
}
/*
+ * If the rec had TRAMP enabled, then it needs to
+ * be cleared. As TRAMP can only be enabled iff
+ * there is only a single ops attached to it.
+ * In other words, always disable it on decrementing.
+ * In the future, we may set it if rec count is
+ * decremented to one, and the ops that is left
+ * has a trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /*
* flags will be cleared in ftrace_check_record()
* if rec count is zero.
*/
@@ -1735,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
ftrace_hash_rec_update_modify(ops, filter_hash, 1);
}
+/*
+ * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
+ * or no update is needed, -EBUSY if it detects a conflict of the flag
+ * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * Note that old_hash and new_hash have the following meanings:
+ * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
+ * - If the hash is EMPTY_HASH, it hits nothing
+ * - Anything else hits the recs which match the hash entries.
+ */
+static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec, *end = NULL;
+ int in_old, in_new;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ /*
+ * Since the IPMODIFY is a very address sensitive action, we do not
+ * allow ftrace_ops to set all functions to new hash.
+ */
+ if (!new_hash || !old_hash)
+ return -EINVAL;
+
+ /* Update rec->flags */
+ do_for_each_ftrace_rec(pg, rec) {
+ /* We need to update only differences of filter_hash */
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new) {
+ /* New entries must ensure no others are using it */
+ if (rec->flags & FTRACE_FL_IPMODIFY)
+ goto rollback;
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } else /* Removed entry */
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+ return 0;
+
+rollback:
+ end = rec;
+
+ /* Roll back what we did above */
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec == end)
+ goto err_out;
+
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new)
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ else
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+err_out:
+ return -EBUSY;
+}
+
+static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+}
+
+/* Disabling always succeeds */
+static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+}
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(old_hash))
+ old_hash = NULL;
+
+ if (ftrace_hash_empty(new_hash))
+ new_hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+}
+
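The effect of the IPMODIFY bookkeeping is exclusivity: at most one ftrace_ops with FTRACE_OPS_FL_IPMODIFY may claim a given record, and such an ops may never match all functions. A hedged sketch of what that means at registration time (handler body, target symbol and init hook are illustrative; the second registration is expected to fail with -EBUSY):

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/string.h>

static void notrace example_handler(unsigned long ip, unsigned long parent_ip,
				    struct ftrace_ops *op, struct pt_regs *regs)
{
	/* an IPMODIFY user would rewrite regs->ip here (e.g. live patching) */
}

static struct ftrace_ops example_ops_a = {
	.func	= example_handler,
	.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};

static struct ftrace_ops example_ops_b = {
	.func	= example_handler,
	.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};

static char example_sym[] = "do_fork";		/* illustrative target */

static int __init example_init(void)
{
	ftrace_set_filter(&example_ops_a, example_sym, strlen(example_sym), 1);
	ftrace_set_filter(&example_ops_b, example_sym, strlen(example_sym), 1);

	register_ftrace_function(&example_ops_a);	  /* first IPMODIFY user: ok */
	return register_ftrace_function(&example_ops_b); /* expected: -EBUSY */
}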
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1745,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+
/**
* ftrace_bug - report and shutdown function tracer
* @failed: The failed type (EFAULT, EINVAL, EPERM)
- * @ip: The address that failed
+ * @rec: The record that failed
*
* The arch code that enables or disables the function tracing
* can call ftrace_bug() when it has detected a problem in
@@ -1757,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
* EINVAL - if what is read at @ip is not what was expected
 * EPERM - if the problem happens on writing to the @ip address
*/
-void ftrace_bug(int failed, unsigned long ip)
+void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
+ unsigned long ip = rec ? rec->ip : 0;
+
switch (failed) {
case -EFAULT:
FTRACE_WARN_ON_ONCE(1);
@@ -1770,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1782,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ if (rec) {
+ struct ftrace_ops *ops = NULL;
+
+ pr_info("ftrace record flags: %lx\n", rec->flags);
+ pr_cont(" (%ld)%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ pr_cont("\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ pr_cont("\ttramp: ERROR!");
+
+ }
+ ip = ftrace_get_addr_curr(rec);
+ pr_cont(" expected tramp: %lx\n", ip);
+ }
}
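
The extra reporting above dumps rec->flags, the attached-ops count and the trampoline the record was expected to call, which makes "expected vs. actual" mismatches much easier to read. The snippet below sketches the same kind of one-line flag summary with an invented flag layout; the real FTRACE_FL_* bit positions and the ftrace_rec_count() packing are not reproduced here.

#include <stdio.h>

/* Invented layout, for illustration only: low bits hold a reference count,
 * high bits hold per-record state. */
#define FL_COUNT_MASK   0x00ffffffUL
#define FL_REGS         (1UL << 24)
#define FL_TRAMP_EN     (1UL << 25)
#define FL_IPMODIFY     (1UL << 26)

static void print_rec_summary(unsigned long ip, unsigned long flags)
{
    printf("%#lx: (%lu)%s%s%s\n", ip,
           flags & FL_COUNT_MASK,
           flags & FL_REGS     ? " R" : "  ",
           flags & FL_TRAMP_EN ? " T" : "  ",
           flags & FL_IPMODIFY ? " I" : "  ");
}

int main(void)
{
    /* two ops attached; the record saves regs and is ip-modified */
    print_rec_summary(0x81000000UL, 2 | FL_REGS | FL_IPMODIFY);
    return 0;
}
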
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -1895,21 +2076,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
- /* Removed ops need to be tested first */
- if (removed_ops && removed_ops->tramp_hash) {
- if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
+ /*
+ * Need to check removed ops first.
+ * If they are being removed, and this rec has a tramp,
+ * and this rec is in the ops list, then it would be the
+ * one with the tramp.
+ */
+ if (removed_ops) {
+ if (hash_contains_ip(ip, &removed_ops->old_hash))
return removed_ops;
}
+ /*
+ * Need to find the current trampoline for a rec.
+ * Now, a trampoline is only attached to a rec if there
+ * was a single 'ops' attached to it. But this can be called
+ * when we are adding another op to the rec or removing the
+ * current one. Thus, if the op is being added, we can
+ * ignore it because it hasn't attached itself to the rec
+ * yet.
+ *
+ * If an ops is being modified (hooking to different functions)
+ * then we don't care about the new functions that are being
+ * added, just the old ones (that are probably being removed).
+ *
+ * If we are adding an ops to a function that already uses
+ * a trampoline, the trampoline needs to be removed (trampolines
+ * are only used when a single ops is connected), so an ops that
+ * is not being modified also needs to be checked.
+ */
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (!op->tramp_hash)
+
+ if (!op->trampoline)
continue;
- if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
+ /*
+ * If the ops is being added, it hasn't gotten to
+ * the point to be removed from this tree yet.
+ */
+ if (op->flags & FTRACE_OPS_FL_ADDING)
+ continue;
+
+ /*
+ * If the ops is being modified and is in the old
+ * hash, then it is probably being removed from this
+ * function.
+ */
+ if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, &op->old_hash))
+ return op;
+ /*
+ * If the ops is not being added or modified, and it's
+ * in its normal filter hash, then this must be the one
+ * we want!
+ */
+ if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, op->func_hash))
return op;
} while_for_each_ftrace_op(op);
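
The rewritten lookup has three cases: skip an ops that is still being added (it cannot own the trampoline yet), match an ops being modified against its old hash, and match every other ops against its current filter hash. A condensed standalone model of that selection logic follows; the op structure, flag names and precomputed hash-membership booleans are illustrative only, not the ftrace API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FL_ADDING    0x1UL
#define FL_MODIFYING 0x2UL

struct op {
    const char *name;
    bool has_trampoline;
    unsigned long flags;
    bool ip_in_hash;      /* ip matches the op's current filter hash */
    bool ip_in_old_hash;  /* ip matched the hash before the change   */
};

static const struct op *find_tramp_op(const struct op *ops, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const struct op *op = &ops[i];

        if (!op->has_trampoline)
            continue;
        /* An op still being added cannot own the trampoline yet */
        if (op->flags & FL_ADDING)
            continue;
        /* An op being modified is matched against its old hash */
        if (op->flags & FL_MODIFYING) {
            if (op->ip_in_old_hash)
                return op;
            continue;
        }
        if (op->ip_in_hash)
            return op;
    }
    return NULL;
}

int main(void)
{
    struct op ops[] = {
        { "new",    true, FL_ADDING, true, false },
        { "stable", true, 0,         true, false },
    };
    const struct op *owner = find_tramp_op(ops, 2);

    printf("owner: %s\n", owner ? owner->name : "none");
    return 0;
}
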
@@ -1921,10 +2167,11 @@ static struct ftrace_ops *
ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
do_for_each_ftrace_op(op, ftrace_ops_list) {
/* pass rec in as regs to have non-NULL val */
- if (ftrace_ops_test(op, rec->ip, rec))
+ if (hash_contains_ip(ip, op->func_hash))
return op;
} while_for_each_ftrace_op(op);
@@ -2038,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
do_for_each_ftrace_rec(pg, rec) {
failed = __ftrace_replace_code(rec, enable);
if (failed) {
- ftrace_bug(failed, rec->ip);
+ ftrace_bug(failed, rec);
/* Stop processing */
return;
}
@@ -2120,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
- unsigned long ip;
int ret;
- ip = rec->ip;
-
if (unlikely(ftrace_disabled))
return 0;
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
- ftrace_bug(ret, ip);
+ ftrace_bug(ret, rec);
return 0;
}
return 1;
@@ -2231,92 +2475,6 @@ void __weak arch_ftrace_update_code(int command)
ftrace_run_stop_machine(command);
}
-static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
-{
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
- int size, bits;
- int ret;
-
- size = ops->nr_trampolines;
- bits = 0;
- /*
- * Make the hash size about 1/2 the # found
- */
- for (size /= 2; size; size >>= 1)
- bits++;
-
- ops->tramp_hash = alloc_ftrace_hash(bits);
- /*
- * TODO: a failed allocation is going to screw up
- * the accounting of what needs to be modified
- * and not. For now, we kill ftrace if we fail
- * to allocate here. But there are ways around this,
- * but that will take a little more work.
- */
- if (!ops->tramp_hash)
- return -ENOMEM;
-
- do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_rec_count(rec) == 1 &&
- ftrace_ops_test(ops, rec->ip, rec)) {
-
- /*
- * If another ops adds to a rec, the rec will
- * lose its trampoline and never get it back
- * until all ops are off of it.
- */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- continue;
-
- /* This record had better have a trampoline */
- if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
- return -1;
-
- ret = add_hash_entry(ops->tramp_hash, rec->ip);
- if (ret < 0)
- return ret;
- }
- } while_for_each_ftrace_rec();
-
- /* The number of recs in the hash must match nr_trampolines */
- if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines))
- pr_warn("count=%ld trampolines=%d\n",
- ops->tramp_hash->count,
- ops->nr_trampolines);
-
- return 0;
-}
-
-static int ftrace_save_tramp_hashes(void)
-{
- struct ftrace_ops *op;
- int ret;
-
- /*
- * Now that any trampoline is being used, we need to save the
- * hashes for the ops that have them. This allows the mapping
- * back from the record to the ops that has the trampoline to
- * know what code is being replaced. Modifying code must always
- * verify what it is changing.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
-
- /* The tramp_hash is recreated each time. */
- free_ftrace_hash(op->tramp_hash);
- op->tramp_hash = NULL;
-
- if (op->nr_trampolines) {
- ret = ftrace_save_ops_tramp_hash(op);
- if (ret)
- return ret;
- }
-
- } while_for_each_ftrace_op(op);
-
- return 0;
-}
-
static void ftrace_run_update_code(int command)
{
int ret;
@@ -2336,14 +2494,25 @@ static void ftrace_run_update_code(int command)
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+}
- ret = ftrace_save_tramp_hashes();
- FTRACE_WARN_ON(ret);
+static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
+ struct ftrace_hash *old_hash)
+{
+ ops->flags |= FTRACE_OPS_FL_MODIFYING;
+ ops->old_hash.filter_hash = old_hash;
+ ftrace_run_update_code(command);
+ ops->old_hash.filter_hash = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
}
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
+void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+}
+
static void control_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
@@ -2362,6 +2531,13 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
+static void ftrace_startup_all(int command)
+{
+ update_all_ops = true;
+ ftrace_startup_enable(command);
+ update_all_ops = false;
+}
+
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
@@ -2376,12 +2552,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
- ops->flags |= FTRACE_OPS_FL_ENABLED;
+ /*
+ * Note that ftrace probes use this to start up
+ * and modify the functions they will probe. But we still
+ * set the ADDING flag for modification, as probes
+ * do not have trampolines. If they add them in the
+ * future, then the probes will need to distinguish
+ * between adding and updating probes.
+ */
+ ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
+
+ ret = ftrace_hash_ipmodify_enable(ops);
+ if (ret < 0) {
+ /* Rollback registration process */
+ __unregister_ftrace_function(ops);
+ ftrace_start_up--;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ return ret;
+ }
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
+ ops->flags &= ~FTRACE_OPS_FL_ADDING;
+
return 0;
}
@@ -2404,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
+ /* Disabling ipmodify never fails */
+ ftrace_hash_ipmodify_disable(ops);
ftrace_hash_rec_disable(ops, 1);
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2431,11 +2628,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* If the ops uses a trampoline, then it needs to be
* tested first on update.
*/
+ ops->flags |= FTRACE_OPS_FL_REMOVING;
removed_ops = ops;
+ /* The trampoline logic checks the old hashes */
+ ops->old_hash.filter_hash = ops->func_hash->filter_hash;
+ ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
+
ftrace_run_update_code(command);
+ /*
+ * If there are no more ops registered with ftrace, run a
+ * sanity check to make sure all rec flags are cleared.
+ */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (FTRACE_WARN_ON_ONCE(rec->flags))
+ pr_warn(" %pS flags:%lx\n",
+ (void *)rec->ip, rec->flags);
+ } while_for_each_ftrace_rec();
+ }
+
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+
removed_ops = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_REMOVING;
/*
* Dynamic ops may be freed, we must make sure that all
@@ -2454,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
schedule_on_each_cpu(ftrace_sync);
+ arch_ftrace_trampoline_free(ops);
+
if (ops->flags & FTRACE_OPS_FL_CONTROL)
control_ops_free(ops);
}
@@ -2606,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
- ftrace_bug(failed, p->ip);
+ ftrace_bug(failed, p);
}
}
}
@@ -2931,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&ftrace_lock);
}
+void * __weak
+arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return NULL;
+}
+
+static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ void *ptr;
+
+ ptr = arch_ftrace_trampoline_func(ops, rec);
+ if (ptr)
+ seq_printf(m, " ->%pS", ptr);
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -2941,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
else
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2954,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- seq_printf(m, " (%ld)%s",
+ struct ftrace_ops *ops = NULL;
+
+ seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
- struct ftrace_ops *ops;
-
- ops = ftrace_find_tramp_ops_curr(rec);
- if (ops && ops->trampoline)
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
seq_printf(m, "\ttramp: %pS",
(void *)ops->trampoline);
else
- seq_printf(m, "\ttramp: ERROR!");
+ seq_puts(m, "\ttramp: ERROR!");
+
}
+ add_trampoline_func(m, ops, rec);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -3003,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
if (iter) {
iter->pg = ftrace_pages_start;
@@ -3340,7 +3579,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
static int ftrace_probe_registered;
-static void __enable_ftrace_function_probe(void)
+static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
{
int ret;
int i;
@@ -3348,7 +3587,8 @@ static void __enable_ftrace_function_probe(void)
if (ftrace_probe_registered) {
/* still need to update the function call sites */
if (ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ old_hash);
return;
}
@@ -3399,6 +3639,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
{
struct ftrace_func_probe *entry;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3417,7 +3658,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
if (!hash) {
count = -ENOMEM;
goto out;
@@ -3476,10 +3717,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
} while_for_each_ftrace_rec();
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
- if (ret < 0)
- count = ret;
- __enable_ftrace_function_probe();
+ __enable_ftrace_function_probe(old_hash);
+
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+ else
+ count = ret;
out_unlock:
mutex_unlock(&ftrace_lock);
@@ -3503,6 +3747,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
@@ -3510,6 +3755,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int type = MATCH_FULL;
int i, len = 0;
char *search;
+ int ret;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
glob = NULL;
@@ -3568,8 +3814,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
- ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
synchronize_sched();
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+
list_for_each_entry_safe(entry, p, &free_list, free_list) {
list_del(&entry->free_list);
ftrace_free_entry(entry);
@@ -3756,10 +4005,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
-static void ftrace_ops_update_code(struct ftrace_ops *ops)
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash)
{
if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
}
static int
@@ -3767,6 +4017,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int ret;
@@ -3801,10 +4052,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret)
- ftrace_ops_update_code(ops);
-
+ if (!ret) {
+ ftrace_ops_update_code(ops, old_hash);
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
out_regex_unlock:
@@ -3944,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static unsigned long save_global_trampoline;
+static unsigned long save_global_flags;
+
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4013,6 +4269,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct trace_parser *parser;
int filter_hash;
int ret;
@@ -4042,11 +4299,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
- if (!ret)
- ftrace_ops_update_code(iter->ops);
-
+ if (!ret) {
+ ftrace_ops_update_code(iter->ops, old_hash);
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
}
@@ -4149,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v)
struct ftrace_graph_data *fgd = m->private;
if (fgd->table == ftrace_graph_funcs)
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
else
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4662,6 +4921,32 @@ void __init ftrace_init(void)
ftrace_disabled = 1;
}
+/* Do nothing if arch does not support this */
+void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+
+/*
+ * Currently there's no safe way to free a trampoline when the kernel
+ * is configured with PREEMPT. That is because a task could be preempted
+ * while it is executing on the trampoline; it may stay preempted for a
+ * long time depending on the system load, and currently there's no way
+ * to know when it will be off the trampoline. If the trampoline is freed
+ * too early, when the task runs again, it will be executing on freed
+ * memory and crash.
+ */
+#ifdef CONFIG_PREEMPT
+ /* Currently, only non dynamic ops can have a trampoline */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ return;
+#endif
+
+ arch_ftrace_update_trampoline(ops);
+}
+
#else
static struct ftrace_ops global_ops = {
@@ -4678,6 +4963,7 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
+static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
# define ftrace_startup(ops, command) \
({ \
@@ -4703,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 1;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -4827,6 +5117,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
}
#endif
+/*
+ * If there's only one function registered but it does not support
+ * recursion, this function will be called by the mcount trampoline.
+ * This function will handle recursion protection.
+ */
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
+ return;
+
+ op->func(ip, parent_ip, op, regs);
+
+ trace_clear_recursion(bit);
+}
+
+/**
+ * ftrace_ops_get_func - get the function a trampoline should call
+ * @ops: the ops to get the function for
+ *
+ * Normally the mcount trampoline will call the ops->func, but there
+ * are times that it should not. For example, if the ops does not
+ * have its own recursion protection, then it should call the
+ * ftrace_ops_recurs_func() instead.
+ *
+ * Returns the function that the trampoline should call for @ops.
+ */
+ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ /*
+ * If the func handles its own recursion, call it directly.
+ * Otherwise call the recursion protected function that
+ * will call the ftrace ops function.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
+ return ftrace_ops_recurs_func;
+
+ return ops->func;
+}
+
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -4927,7 +5267,8 @@ static int ftrace_pid_add(int p)
set_ftrace_pid_task(pid);
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
return 0;
@@ -4956,7 +5297,7 @@ static void ftrace_pid_reset(void)
}
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
}
@@ -4989,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v)
const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
if (v == (void *)1) {
- seq_printf(m, "no pid\n");
+ seq_puts(m, "no pid\n");
return 0;
}
if (fpid->pid == ftrace_swapper_pid)
- seq_printf(m, "swapper tasks\n");
+ seq_puts(m, "swapper tasks\n");
else
seq_printf(m, "%u\n", pid_vnr(fpid->pid));
@@ -5207,6 +5548,7 @@ static struct ftrace_ops graph_ops = {
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
#endif
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
@@ -5436,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
update_function_graph_func();
ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-
out:
mutex_unlock(&ftrace_lock);
return ret;
@@ -5457,6 +5798,17 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * Function graph does not allocate the trampoline, but
+ * other global_ops do. We need to reset the ALLOC_TRAMP flag
+ * if one was used.
+ */
+ global_ops.trampoline = save_global_trampoline;
+ if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
+ global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+#endif
+
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94ae87d..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
*/
int ring_buffer_print_entry_header(struct trace_seq *s)
{
- int ret;
-
- ret = trace_seq_puts(s, "# compressed entry header\n");
- ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
- ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
- ret = trace_seq_puts(s, "\tarray : 32 bits\n");
- ret = trace_seq_putc(s, '\n');
- ret = trace_seq_printf(s, "\tpadding : type == %d\n",
- RINGBUF_TYPE_PADDING);
- ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
- RINGBUF_TYPE_TIME_EXTEND);
- ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
- RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ trace_seq_puts(s, "# compressed entry header\n");
+ trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ trace_seq_puts(s, "\tarray : 32 bits\n");
+ trace_seq_putc(s, '\n');
+ trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
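
The rewrite above reflects the new trace_seq error model: individual trace_seq_*() calls no longer return a value that must be checked, and the caller asks once at the end whether the sequence overflowed. A small userspace model of that "latch the overflow, check it once" style; struct seq and its helper are made up, not the trace_seq API.

#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct seq {
    char buf[64];
    size_t len;
    bool overflowed;
};

/* Append formatted text; on truncation just latch a flag instead of failing. */
static void seq_printf_(struct seq *s, const char *fmt, ...)
{
    va_list ap;
    int n;

    if (s->overflowed)
        return;

    va_start(ap, fmt);
    n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
    va_end(ap);

    if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
        s->overflowed = true;   /* remember the failure, keep going */
    else
        s->len += (size_t)n;
}

static int print_header(struct seq *s)
{
    seq_printf_(s, "# compressed entry header\n");
    seq_printf_(s, "\ttype_len : 5 bits\n");
    seq_printf_(s, "\ttime_delta : 27 bits\n");
    /* one check at the end instead of one per call */
    return !s->overflowed;
}

int main(void)
{
    struct seq s = { { 0 }, 0, false };

    /* the 64-byte buffer is too small for all three lines, so ok prints 0 */
    printf("ok=%d\n%s\n", print_header(&s), s.buf);
    return 0;
}
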
/*
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
- int ret;
- ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
- return ret;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
}
struct rb_irq_work {
@@ -538,16 +535,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ int ret = 0;
/*
* Depending on what the caller is waiting for, either any
@@ -564,36 +563,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
}
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ while (true) {
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- work->waiters_pending = true;
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ work->waiters_pending = true;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+ break;
+
+ if (cpu != RING_BUFFER_ALL_CPUS &&
+ !ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ break;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (!pagebusy)
+ break;
+ }
- if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
schedule();
+ }
finish_wait(&work->waiters, &wait);
- return 0;
+
+ return ret;
}
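
The new loop re-evaluates its wake-up condition on every pass: a pending signal ends the wait with -EINTR, an all-CPUs waiter wakes on any data, and a per-CPU waiter with @full set keeps sleeping while the reader page is still the commit page, i.e. no completed page is available yet. The helper below is a standalone sketch of just that predicate; the struct and field names are invented stand-ins for the ring buffer state.

#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
    bool has_data;                 /* ring_buffer_empty_cpu() == false   */
    bool reader_is_commit_page;    /* reader still shares the write page */
};

/* Should a waiter stop sleeping? Mirrors the checks in the wait loop. */
static bool should_wake(bool wait_all_cpus, bool any_cpu_has_data,
                        const struct cpu_state *cpu, bool full)
{
    if (wait_all_cpus)
        return any_cpu_has_data;
    if (!cpu->has_data)
        return false;
    if (!full)
        return true;                        /* any data is enough */
    return !cpu->reader_is_commit_page;     /* need a completed page */
}

int main(void)
{
    struct cpu_state cpu = { .has_data = true, .reader_is_commit_page = true };

    printf("full waiter wakes:   %d\n", should_wake(false, false, &cpu, true));
    printf("normal waiter wakes: %d\n", should_wake(false, false, &cpu, false));
    return 0;
}
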
/**
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
break;
schedule();
- __set_current_state(TASK_RUNNING);
}
reader_finish = 0;
complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
break;
schedule();
- __set_current_state(TASK_RUNNING);
}
__set_current_state(TASK_RUNNING);
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ * SLEEP_TIME);
- __set_current_state(TASK_RUNNING);
}
if (kill_test)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a528392b1f4..2e767972e99c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
*/
bool __read_mostly tracing_selftest_disabled;
+/* Pipe tracepoints to printk */
+struct trace_iterator *tracepoint_print_iter;
+int tracepoint_printk;
+
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
{ }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
static int __init stop_trace_on_warning(char *str)
{
- __disable_trace_on_warning = 1;
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ __disable_trace_on_warning = 1;
return 1;
}
-__setup("traceoff_on_warning=", stop_trace_on_warning);
+__setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
}
__setup("trace_clock=", set_trace_boot_clock);
+static int __init set_tracepoint_printk(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ tracepoint_printk = 1;
+ return 1;
+}
+__setup("tp_printk", set_tracepoint_printk);
unsigned long long ns2usecs(cycle_t nsec)
{
@@ -938,19 +950,20 @@ out:
return ret;
}
+/* TODO add a seq_buf_to_buffer() */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (s->len <= s->readpos)
+ if (trace_seq_used(s) <= s->seq.readpos)
return -EBUSY;
- len = s->len - s->readpos;
+ len = trace_seq_used(s) - s->seq.readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->readpos, cnt);
+ memcpy(buf, s->buffer + s->seq.readpos, cnt);
- s->readpos += cnt;
+ s->seq.readpos += cnt;
return cnt;
}
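
trace_seq_to_buffer() now measures pending data with trace_seq_used() and the read position kept in the embedded seq_buf. The same "copy what fits, remember how far we got" pattern in plain C; the names below are illustrative, not the seq_buf API.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct src {
    const char *buf;
    size_t used;     /* bytes produced so far              */
    size_t readpos;  /* bytes already handed to the reader */
};

/* Copy up to cnt unread bytes into dst; return how many were copied,
 * or -1 when there is nothing left (think -EBUSY in the original). */
static long copy_some(struct src *s, char *dst, size_t cnt)
{
    size_t avail;

    if (s->used <= s->readpos)
        return -1;

    avail = s->used - s->readpos;
    if (cnt > avail)
        cnt = avail;

    memcpy(dst, s->buf + s->readpos, cnt);
    s->readpos += cnt;
    return (long)cnt;
}

int main(void)
{
    struct src s = { "hello world", 11, 0 };
    char out[8];
    long n;

    while ((n = copy_some(&s, out, 4)) > 0)
        printf("got %ld bytes: %.*s\n", n, (int)n, out);
    return 0;
}
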
@@ -1076,13 +1089,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static int wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ full);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2028,7 +2042,7 @@ void trace_printk_init_buffers(void)
pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
pr_warning("** **\n");
pr_warning("** This means that this is a DEBUG kernel and it is **\n");
- pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** unsafe for production use. **\n");
pr_warning("** **\n");
pr_warning("** If you see this message and you are not debugging **\n");
pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2157,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
goto out;
}
- len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- if (len > TRACE_BUF_SIZE)
- goto out;
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
@@ -2170,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, tbuffer, len);
- entry->buf[len] = '\0';
+ memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
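
Switching from vsnprintf() to vscnprintf() removes the manual truncation check: vsnprintf() returns the length the output would have had, while vscnprintf() returns the number of characters actually stored, so len can feed the len + 1 byte copy directly. Userspace has no scnprintf(), but the behaviour is easy to wrap; scnprintf_() below is a local stand-in, not a libc function.

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>

/* Like snprintf(), but return how many characters were actually stored
 * (excluding the NUL), never the would-have-been length. */
static int scnprintf_(char *buf, size_t size, const char *fmt, ...)
{
    va_list ap;
    int n;

    va_start(ap, fmt);
    n = vsnprintf(buf, size, fmt, ap);
    va_end(ap);

    if (n < 0)
        return 0;
    if ((size_t)n >= size)
        return size ? (int)(size - 1) : 0;
    return n;
}

int main(void)
{
    char buf[8];
    int len = scnprintf_(buf, sizeof(buf), "%s", "truncated text");

    /* len is 7 here, so copying len + 1 bytes stays in bounds */
    printf("len=%d buf=\"%s\"\n", len, buf);
    return 0;
}
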
@@ -2508,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2532,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | | |\n");
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
+ "# | | | | |\n");
}
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n");
- seq_puts(m, "# / _----=> need-resched\n");
- seq_puts(m, "# | / _---=> hardirq/softirq\n");
- seq_puts(m, "# || / _--=> preempt-depth\n");
- seq_puts(m, "# ||| / delay\n");
- seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | |||| | |\n");
+ seq_puts(m, "# _-----=> irqs-off\n"
+ "# / _----=> need-resched\n"
+ "# | / _---=> hardirq/softirq\n"
+ "# || / _--=> preempt-depth\n"
+ "# ||| / delay\n"
+ "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
+ "# | | | |||| | |\n");
}
void
@@ -2648,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- if (!trace_print_lat_context(iter))
- goto partial;
- } else {
- if (!trace_print_context(iter))
- goto partial;
- }
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ trace_print_lat_context(iter);
+ else
+ trace_print_context(iter);
}
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
if (event)
return event->funcs->trace(iter, sym_flags, event);
- if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2676,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (!trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, iter->ts))
- goto partial;
- }
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
event = ftrace_find_event(entry->type);
if (event)
return event->funcs->raw(iter, 0, event);
- if (!trace_seq_printf(s, "%d ?\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "%d ?\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2704,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+ SEQ_PUT_HEX_FIELD(s, entry->pid);
+ SEQ_PUT_HEX_FIELD(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2716,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
return ret;
}
- SEQ_PUT_FIELD_RET(s, newline);
+ SEQ_PUT_FIELD(s, newline);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2730,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
- SEQ_PUT_FIELD_RET(s, iter->ts);
+ SEQ_PUT_FIELD(s, entry->pid);
+ SEQ_PUT_FIELD(s, iter->cpu);
+ SEQ_PUT_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2778,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
- if (iter->lost_events &&
- !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events))
- return TRACE_TYPE_PARTIAL_LINE;
+ if (iter->lost_events) {
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
@@ -2859,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
{
if (!ftrace_is_dead())
return;
- seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
}
#ifdef CONFIG_TRACER_MAX_TRACE
static void show_snapshot_main_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer.\n");
- seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void show_snapshot_percpu_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
#else
- seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
- seq_printf(m, "# Must use main snapshot file to allocate.\n");
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
#endif
- seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
if (iter->tr->allocated_snapshot)
- seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
else
- seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
- seq_printf(m, "# Snapshot commands:\n");
+ seq_puts(m, "# Snapshot commands:\n");
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
show_snapshot_main_help(m);
else
@@ -3250,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
if (!t)
return 0;
- seq_printf(m, "%s", t->name);
+ seq_puts(m, t->name);
if (t->next)
seq_putc(m, ' ');
else
@@ -4313,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
goto out;
}
+ trace_seq_init(&iter->seq);
+
/*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
@@ -4434,15 +4448,12 @@ static int tracing_wait_pipe(struct file *filp)
mutex_unlock(&iter->mutex);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, false);
mutex_lock(&iter->mutex);
if (ret)
return ret;
-
- if (signal_pending(current))
- return -EINTR;
}
return 1;
@@ -4509,18 +4520,18 @@ waitagain:
trace_access_lock(iter->cpu_file);
while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
- int len = iter->seq.len;
+ int save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
/* don't print partial lines */
- iter->seq.len = len;
+ iter->seq.seq.len = save_len;
break;
}
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
- if (iter->seq.len >= cnt)
+ if (trace_seq_used(&iter->seq) >= cnt)
break;
/*
@@ -4536,7 +4547,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.readpos >= iter->seq.len)
+ if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -4570,20 +4581,33 @@ static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
size_t count;
+ int save_len;
int ret;
/* Seq buffer is page-sized, exactly what we need. */
for (;;) {
- count = iter->seq.len;
+ save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
- count = iter->seq.len - count;
- if (rem < count) {
- rem = 0;
- iter->seq.len -= count;
+
+ if (trace_seq_has_overflowed(&iter->seq)) {
+ iter->seq.seq.len = save_len;
break;
}
+
+ /*
+ * This should not be hit, because TRACE_TYPE_PARTIAL_LINE
+ * should only be returned if the iter->seq overflowed, and
+ * that case was handled above. But check it anyway to be safe.
+ */
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- iter->seq.len -= count;
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ count = trace_seq_used(&iter->seq) - save_len;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.seq.len = save_len;
break;
}
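
tracing_fill_pipe_page() now snapshots the sequence length before rendering each line and rewinds to that snapshot when the render overflowed or the line would not fit in the space left, so a partial line never reaches the reader. The pattern in isolation, with invented names rather than the trace_seq API:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct page_seq {
    char buf[32];
    size_t len;
};

/* Render a line speculatively; rewind to the snapshot if it did not fit. */
static bool append_line_or_rewind(struct page_seq *s, const char *line)
{
    size_t save_len = s->len;   /* snapshot, like save_len above */
    int n = snprintf(s->buf + s->len, sizeof(s->buf) - s->len, "%s", line);

    if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len) {
        s->len = save_len;      /* overflowed: drop the partial line */
        return false;
    }
    s->len += (size_t)n;
    return true;
}

int main(void)
{
    struct page_seq s = { .len = 0 };

    printf("%d\n", append_line_or_rewind(&s, "short line\n"));
    printf("%d\n", append_line_or_rewind(&s, "a line that is far too long to fit here\n"));
    printf("kept %zu bytes: %.*s", s.len, (int)s.len, s.buf);
    return 0;
}
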
@@ -4664,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- iter->seq.len);
+ trace_seq_used(&iter->seq));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = iter->seq.len;
+ spd.partial[i].len = trace_seq_used(&iter->seq);
trace_seq_init(&iter->seq);
}
@@ -5372,16 +5396,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
goto out_unlock;
}
mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, false);
mutex_lock(&trace_types_lock);
if (ret) {
size = ret;
goto out_unlock;
}
- if (signal_pending(current)) {
- size = -EINTR;
- goto out_unlock;
- }
goto again;
}
size = 0;
@@ -5500,7 +5520,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
};
struct buffer_ref *ref;
int entries, size, i;
- ssize_t ret;
+ ssize_t ret = 0;
mutex_lock(&trace_types_lock);
@@ -5538,13 +5558,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int r;
ref = kzalloc(sizeof(*ref), GFP_KERNEL);
- if (!ref)
+ if (!ref) {
+ ret = -ENOMEM;
break;
+ }
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
+ ret = -ENOMEM;
kfree(ref);
break;
}
@@ -5582,19 +5605,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ if (ret)
+ goto out;
+
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
ret = -EAGAIN;
goto out;
}
mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, true);
mutex_lock(&trace_types_lock);
if (ret)
goto out;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+
goto again;
}
@@ -5671,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "read events: %ld\n", cnt);
- count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+ count = simple_read_from_buffer(ubuf, count, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -5752,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%ps:", (void *)ip);
- seq_printf(m, "snapshot");
+ seq_puts(m, "snapshot");
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
@@ -6420,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6447,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6634,11 +6658,19 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= TRACE_MAX_PRINT)
- s->len = TRACE_MAX_PRINT;
+ if (s->seq.len >= TRACE_MAX_PRINT)
+ s->seq.len = TRACE_MAX_PRINT;
+
+ /*
+ * More paranoid code. Although the buffer size is set to
+ * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
+ * an extra layer of protection.
+ */
+ if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
+ s->seq.len = s->seq.size - 1;
/* should be zero ended, but we are paranoid. */
- s->buffer[s->len] = 0;
+ s->buffer[s->seq.len] = 0;
printk(KERN_TRACE "%s", s->buffer);
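
trace_printk_seq() clamps the length to TRACE_MAX_PRINT and then, defensively, to one byte less than the buffer size before forcing a terminator, so the printk can never run past the buffer even with a corrupted length. The guard on its own; the sizes and limit below are placeholders, not the real values.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define MAX_PRINT 16    /* placeholder for TRACE_MAX_PRINT */

static void print_clamped(char *buf, size_t size, size_t len)
{
    if (len >= MAX_PRINT)
        len = MAX_PRINT;
    if (len >= size)            /* paranoid second clamp */
        len = size - 1;
    buf[len] = '\0';            /* force termination before printing */
    printf("%s\n", buf);
}

int main(void)
{
    char buf[24];

    memset(buf, 'x', sizeof(buf));  /* deliberately unterminated */
    print_clamped(buf, sizeof(buf), sizeof(buf) + 100);
    return 0;
}
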
@@ -6877,6 +6909,19 @@ out:
return ret;
}
+void __init trace_init(void)
+{
+ if (tracepoint_printk) {
+ tracepoint_print_iter =
+ kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ if (WARN_ON(!tracepoint_print_iter))
+ tracepoint_printk = 0;
+ }
+ tracer_alloc_buffers();
+ init_ftrace_syscalls();
+ trace_event_init();
+}
+
__init static int clear_boot_tracer(void)
{
/*
@@ -6896,6 +6941,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-early_initcall(tracer_alloc_buffers);
fs_initcall(tracer_init_debugfs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
#include <linux/compiler.h>
+#include <linux/trace_seq.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc);
-
-void tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *cur,
- unsigned long flags, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
-void tracing_sched_switch_assign_trace(struct trace_array *tr);
-void tracing_stop_sched_switch_record(void);
-void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
+extern char trace_find_mark(unsigned long long duration);
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
-extern enum print_line_t
+extern void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
#define perf_ftrace_event_register NULL
#endif
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+#else
+static inline void __init trace_event_init(void) { }
+#endif
+
+extern struct trace_iterator *tracepoint_print_iter;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+
+ return trace_handle_return(&iter->seq);
}
static void branch_print_header(struct seq_file *s)
{
seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
- " FUNC:FILE:LINE\n");
- seq_puts(s, "# | | | | | "
- " |\n");
+ " FUNC:FILE:LINE\n"
+ "# | | | | | "
+ " |\n");
}
static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
static int annotated_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " correct incorrect % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
- seq_printf(m, " X ");
+ seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
static int all_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " miss hit %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " miss hit % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ef06ce7e9cf8..366a78a3e61e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+
+static void output_printk(struct ftrace_event_buffer *fbuffer)
+{
+ struct ftrace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ if (!iter)
+ return;
+
+ event_call = fbuffer->ftrace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->ftrace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
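
output_printk() renders the just-committed event through the single shared tracepoint_print_iter, so the format-and-print step is serialized with a spinlock. A rough userspace analogue of "one shared scratch buffer, one lock around format-then-emit"; a pthread mutex stands in for the spinlock and the render callback for event->funcs->trace().

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

static pthread_mutex_t print_lock = PTHREAD_MUTEX_INITIALIZER;
static char scratch[256];   /* shared, like the iterator's seq buffer */

/* Render one event into the shared buffer and emit it, under the lock. */
static void emit_event(void (*render)(char *buf, size_t size, const void *ev),
                       const void *ev)
{
    pthread_mutex_lock(&print_lock);
    render(scratch, sizeof(scratch), ev);
    printf("%s\n", scratch);
    pthread_mutex_unlock(&print_lock);
}

static void render_int(char *buf, size_t size, const void *ev)
{
    snprintf(buf, size, "event: value=%d", *(const int *)ev);
}

int main(void)
{
    int v = 42;

    emit_event(render_int, &v);
    return 0;
}
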
+
void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
{
+ if (tracepoint_printk)
+ output_printk(fbuffer);
+
event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dir->d_subdirs, d_child) {
if (child->d_inode) /* probably unneeded */
child->d_inode->i_private = NULL;
}
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
- seq_printf(m, "format:\n");
+ seq_puts(m, "format:\n");
return 0;
case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_unlock(&event_mutex);
if (file)
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
print_subsystem_event_filter(system, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
func(s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
ftrace_event_name(data->file->event_call));
if (data->count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", data->count);
@@ -2477,8 +2512,14 @@ static __init int event_trace_init(void)
#endif
return 0;
}
-early_initcall(event_trace_memsetup);
-core_initcall(event_trace_enable);
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2513,8 +2554,11 @@ static __init int event_test_thread(void *unused)
kfree(test_malloc);
set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop())
+ while (!kthread_should_stop()) {
schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
return 0;
}
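
Besides the tracepoint_printk plumbing, the trace_events.c hunk fixes the self-test thread's sleep loop: the task state has to be set back to TASK_INTERRUPTIBLE after every schedule() and restored to TASK_RUNNING on exit; with the old code the thread ran all later iterations in TASK_RUNNING and spun instead of sleeping. The same "prepare to sleep, then re-check the condition" discipline shows up in user space with a condition variable. The sketch below is only an analogy (pthreads in place of the kthread API) and none of its names come from the kernel:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool should_stop;                 /* plays kthread_should_stop() */

static void *test_thread(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&lock);
        /* Re-check the stop condition after every wakeup while still
         * "prepared to sleep" (holding the lock), the same way the
         * fixed loop re-arms TASK_INTERRUPTIBLE after each schedule(). */
        while (!should_stop)
                pthread_cond_wait(&wake, &lock);
        pthread_mutex_unlock(&lock);
        printf("test thread exiting\n");
        return NULL;
}

int main(void)                           /* build with: cc file.c -lpthread */
{
        pthread_t t;

        pthread_create(&t, NULL, test_thread, NULL);
        sleep(1);
        pthread_mutex_lock(&lock);
        should_stop = true;              /* plays kthread_stop() */
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}
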
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
OP_GT,
OP_GE,
OP_BAND,
+ OP_NOT,
OP_NONE,
OP_OPEN_PAREN,
};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
{ OP_GT, ">", 5 },
{ OP_GE, ">=", 5 },
{ OP_BAND, "&", 6 },
+ { OP_NOT, "!", 6 },
{ OP_NONE, "OP_NONE", 0 },
{ OP_OPEN_PAREN, "(", 0 },
};
@@ -85,6 +87,7 @@ enum {
FILT_ERR_MISSING_FIELD,
FILT_ERR_INVALID_FILTER,
FILT_ERR_IP_FIELD_ONLY,
+ FILT_ERR_ILLEGAL_NOT_OP,
};
static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
"Missing field name and/or value",
"Meaningless filter expression",
"Only 'ip' field is supported for function trace",
+ "Illegal use of '!'",
};
struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
int index;
};
+/* If not of not match is equal to not of not, then it is a match */
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
break; \
} \
\
- return match; \
+ return !!match == !pred->not; \
}
#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
if (!WARN_ON_ONCE(!pred->fn))
match = pred->fn(pred, rec);
if (!!match == type)
- return match;
+ break;
}
- return match;
+ /* If not of not match is equal to not of not, then it is a match */
+ return !!match == !op->not;
}
struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
* then this op can be folded.
*/
if (left->index & FILTER_PRED_FOLD &&
- (left->op == dest->op ||
+ ((left->op == dest->op && !left->not) ||
left->left == FILTER_PRED_INVALID) &&
right->index & FILTER_PRED_FOLD &&
- (right->op == dest->op ||
+ ((right->op == dest->op && !right->not) ||
right->left == FILTER_PRED_INVALID))
dest->index |= FILTER_PRED_FOLD;
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
}
if (pred->op == OP_NE)
- pred->not = 1;
+ pred->not ^= 1;
pred->fn = fn;
return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
continue;
}
+ if (elt->op == OP_NOT) {
+ if (!n_preds || operand1 || operand2) {
+ parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ if (!dry_run)
+ filter->preds[n_preds - 1].not ^= 1;
+ continue;
+ }
+
if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
err = -ENOSPC;
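
The filter changes above add a '!' operator by giving each predicate a 'not' flag that is toggled with '^= 1' (so '!' composes with '!=' and two inversions cancel) and by folding the flag into the result as '!!match == !pred->not'. A small stand-alone program that just prints that identity's truth table; the function name here is made up for the illustration:

#include <stdio.h>

/* The expression used by the patch: normalize 'match' to 0/1 and
 * compare it against the inverted flag.  When the flag is clear the
 * result follows 'match'; when it is set the result is inverted. */
static int apply_not(int match, int negate)
{
        return !!match == !negate;
}

int main(void)
{
        for (int match = 0; match <= 1; match++)
                for (int negate = 0; negate <= 1; negate++)
                        printf("match=%d negate=%d -> %d\n",
                               match, negate, apply_not(match, negate));
        return 0;
}
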
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
{
long count = (long)data;
- seq_printf(m, "%s", name);
+ seq_puts(m, name);
if (count == -1)
seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
if (filter_str)
seq_printf(m, " if %s\n", filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
if (data->filter_str)
seq_printf(m, " if %s\n", data->filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
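
The trigger hunks convert seq_printf(m, "%s", name) to seq_puts(m, name) and a lone "\n" to seq_putc(m, '\n'). That conversion is only valid because the variable was passed as an argument rather than as the format itself; a quick user-space reminder of the distinction, with fprintf()/fputs() again standing in for the seq helpers as an assumption of the sketch:

#include <stdio.h>

static void print_name(FILE *f, const char *name)
{
        fprintf(f, "%s", name);   /* safe: name is data, format is fixed */
        fputs(name, f);           /* equivalent, skips format parsing    */
        /* fprintf(f, name);         would treat name as a format string */
}

int main(void)
{
        print_name(stdout, "100% done\n");
        return 0;
}
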
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static int update_count(void **data)
+static void update_traceon_count(void **data, bool on)
{
- unsigned long *count = (long *)data;
+ long *count = (long *)data;
+ long old_count = *count;
- if (!*count)
- return 0;
+ /*
+ * Tracing gets disabled (or enabled) once per count.
+ * This function can be called at the same time on multiple CPUs.
+ * It is fine if both disable (or enable) tracing, as disabling
+ * (or enabling) the second time doesn't do anything as the
+ * state of the tracer is already disabled (or enabled).
+ * What needs to be synchronized in this case is that the count
+ * only gets decremented once, even if the tracer is disabled
+ * (or enabled) twice, as the second one is really a nop.
+ *
+ * The memory barriers guarantee that we only decrement the
+ * counter once. First the count is read to a local variable
+ * and a read barrier is used to make sure that it is loaded
+ * before checking if the tracer is in the state we want.
+ * If the tracer is not in the state we want, then the count
+ * is guaranteed to be the old count.
+ *
+ * Next the tracer is set to the state we want (disabled or enabled)
+ * then a write memory barrier is used to make sure that
+ * the new state is visible before changing the counter by
+ * one minus the old counter. This guarantees that another CPU
+ * executing this code will see the new state before seeing
+ * the new counter value, and would not do anything if the new
+ * counter is seen.
+ *
+ * Note, there is no synchronization between this and a user
+ * setting the tracing_on file. But we currently don't care
+ * about that.
+ */
+ if (!old_count)
+ return;
- if (*count != -1)
- (*count)--;
+ /* Make sure we see count before checking tracing state */
+ smp_rmb();
- return 1;
+ if (on == !!tracing_is_on())
+ return;
+
+ if (on)
+ tracing_on();
+ else
+ tracing_off();
+
+ /* unlimited? */
+ if (old_count == -1)
+ return;
+
+ /* Make sure tracing state is visible before updating count */
+ smp_wmb();
+
+ *count = old_count - 1;
}
static void
ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_on();
+ update_traceon_count(data, 1);
}
static void
ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_off();
+ update_traceon_count(data, 0);
}
static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
static void
ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
+ long *count = (long *)data;
+ long old_count;
+ long new_count;
- if (update_count(data))
- trace_dump_stack(STACK_SKIP);
+ /*
+ * Stack traces should only execute the number of times the
+ * user specified in the counter.
+ */
+ do {
+
+ if (!tracing_is_on())
+ return;
+
+ old_count = *count;
+
+ if (!old_count)
+ return;
+
+ /* unlimited? */
+ if (old_count == -1) {
+ trace_dump_stack(STACK_SKIP);
+ return;
+ }
+
+ new_count = old_count - 1;
+ new_count = cmpxchg(count, old_count, new_count);
+ if (new_count == old_count)
+ trace_dump_stack(STACK_SKIP);
+
+ } while (new_count != old_count);
+}
+
+static int update_count(void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return 0;
+
+ if (*count != -1)
+ (*count)--;
+
+ return 1;
}
static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
};
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags);
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
static int max_bytes_for_cpu;
-static enum print_line_t
-print_graph_cpu(struct trace_seq *s, int cpu)
+static void print_graph_cpu(struct trace_seq *s, int cpu)
{
- int ret;
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
}
#define TRACE_GRAPH_PROCINFO_LENGTH 14
-static enum print_line_t
-print_graph_proc(struct trace_seq *s, pid_t pid)
+static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
- int ret;
int len;
int i;
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
/* First spaces to align center */
- for (i = 0; i < spaces / 2; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < spaces / 2; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s-%s", comm, pid_str);
/* Last spaces to align center */
- for (i = 0; i < spaces - (spaces / 2); i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = 0; i < spaces - (spaces / 2); i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
-print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- if (!trace_seq_putc(s, ' '))
- return 0;
-
- return trace_print_lat_fmt(s, entry);
+ trace_seq_putc(s, ' ');
+ trace_print_lat_fmt(s, entry);
}
/* If the pid changed since the last trace, output this event */
-static enum print_line_t
+static void
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
- int ret;
if (!data)
- return TRACE_TYPE_HANDLED;
+ return;
last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
if (*last_pid == pid)
- return TRACE_TYPE_HANDLED;
+ return;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
- return TRACE_TYPE_HANDLED;
+ return;
/*
* Context-switch trace line:
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
------------------------------------------
*/
- ret = trace_seq_puts(s,
- " ------------------------------------------\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, prev_pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " => ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s,
- "\n ------------------------------------------\n\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " ------------------------------------------\n");
+ print_graph_cpu(s, cpu);
+ print_graph_proc(s, prev_pid);
+ trace_seq_puts(s, " => ");
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, "\n ------------------------------------------\n\n");
}
static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
return next;
}
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
+static void print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
- return trace_seq_printf(s, "%5lu.%06lu | ",
- (unsigned long)t, usecs_rem);
+ trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
}
-static enum print_line_t
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
- int ret;
struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
- return TRACE_TYPE_UNHANDLED;
+ return;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, " | ");
}
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
}
/* No overhead */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
- ret = trace_seq_puts(s, "==========>");
+ trace_seq_puts(s, "==========>");
else
- ret = trace_seq_puts(s, "<==========");
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "<==========");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ trace_seq_putc(s, '\n');
}
-enum print_line_t
+void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
- char msecs_str[21];
+ char usecs_str[21];
char nsecs_str[5];
- int ret, len;
+ int len;
int i;
- sprintf(msecs_str, "%lu", (unsigned long) duration);
+ sprintf(usecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
- ret = trace_seq_printf(s, "%s", msecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s", usecs_str);
- len = strlen(msecs_str);
+ len = strlen(usecs_str);
/* Print nsecs (we don't want to exceed 7 numbers) */
if (len < 7) {
size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
- ret = trace_seq_printf(s, ".%s", nsecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, ".%s", nsecs_str);
len += strlen(nsecs_str);
}
- ret = trace_seq_puts(s, " us ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = len; i < 7; i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags)
{
- int ret = -1;
-
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return TRACE_TYPE_HANDLED;
+ return;
/* No real adata, just filling the column with spaces */
switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
case FLAGS_FILL_FULL:
- ret = trace_seq_puts(s, " | ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " | ");
+ return;
case FLAGS_FILL_START:
- ret = trace_seq_puts(s, " ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " ");
+ return;
case FLAGS_FILL_END:
- ret = trace_seq_puts(s, " |");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " |");
+ return;
}
/* Signal a overhead of time execution to the output */
- if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- ret = trace_seq_puts(s, "! ");
- /* Duration exceeded 10 msecs */
- else if (duration > 10000ULL)
- ret = trace_seq_puts(s, "+ ");
- }
-
- /*
- * The -1 means we either did not exceed the duration tresholds
- * or we dont want to print out the overhead. Either way we need
- * to fill out the space.
- */
- if (ret == -1)
- ret = trace_seq_puts(s, " ");
-
- /* Catching here any failure happenned above */
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_print_graph_duration(duration, s);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_puts(s, "| ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
+ trace_seq_printf(s, "%c ", trace_find_mark(duration));
+ else
+ trace_seq_puts(s, " ");
- return TRACE_TYPE_HANDLED;
+ trace_print_graph_duration(duration, s);
+ trace_seq_puts(s, "| ");
}
/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- int ret;
int i;
graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
- int ret;
int i;
if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- if (!ret)
+ if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
/*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_NO_CONSUME;
}
-static enum print_line_t
+static void
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
- int ret;
/* Pid */
- if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ verif_pid(s, ent->pid, cpu, data);
- if (type) {
+ if (type)
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return 0;
+ return;
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, ent->pid);
+ trace_seq_puts(s, " | ");
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
- ret = print_graph_lat_fmt(s, ent);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
- return 0;
+ return;
}
/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
- int ret;
int i;
if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
}
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Closing brace */
- for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/*
* If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* belongs to, write out the function name. Always do
* that if the funcgraph-tail option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
- ret = trace_seq_puts(s, "}\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- } else {
- ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
/* Overrun */
- if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
- ret = trace_seq_printf(s, " (Overruns: %lu)\n",
- trace->overrun);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN)
+ trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
- cpu, pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (data)
depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
- for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/* The comment */
- ret = trace_seq_puts(s, "/* ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return ret;
}
+ if (trace_seq_has_overflowed(s))
+ goto out;
+
/* Strip ending newline */
- if (s->buffer[s->len - 1] == '\n') {
- s->buffer[s->len - 1] = '\0';
- s->len--;
+ if (s->buffer[s->seq.len - 1] == '\n') {
+ s->buffer[s->seq.len - 1] = '\0';
+ s->seq.len--;
}
- ret = trace_seq_puts(s, " */\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " */\n");
+ out:
+ return trace_handle_return(s);
}
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
print_lat_header(s, flags);
/* 1st line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " TIME ");
+ seq_puts(s, " TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " CPU");
+ seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " TASK/PID ");
+ seq_puts(s, " TASK/PID ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " DURATION ");
- seq_printf(s, " FUNCTION CALLS\n");
+ seq_puts(s, " DURATION ");
+ seq_puts(s, " FUNCTION CALLS\n");
/* 2nd line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " | | ");
+ seq_puts(s, " | | ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " | | ");
- seq_printf(s, " | | | |\n");
+ seq_puts(s, " | | ");
+ seq_puts(s, " | | | |\n");
}
static void print_graph_headers(struct seq_file *s)
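
The bulk of the trace_functions_graph.c change is mechanical: the helpers stop returning TRACE_TYPE_PARTIAL_LINE after every write because the trace_seq API now latches an overflow flag internally, and the top-level printer checks it once through trace_handle_return() or trace_seq_has_overflowed(). A toy model of that contract; the struct, sizes and names below are invented for the illustration and are not the kernel's trace_seq:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Writers never fail, they just set an overflow flag when the buffer
 * fills, and the caller checks once at the end instead of after every
 * print. */
struct toy_seq {
        char buf[64];
        size_t len;
        bool overflowed;
};

static void toy_seq_puts(struct toy_seq *s, const char *str)
{
        size_t n = strlen(str);

        if (s->overflowed || s->len + n >= sizeof(s->buf)) {
                s->overflowed = true;
                return;
        }
        memcpy(s->buf + s->len, str, n);
        s->len += n;
        s->buf[s->len] = '\0';
}

static int toy_handle_return(struct toy_seq *s)
{
        return s->overflowed ? -1 /* partial line */ : 0 /* handled */;
}

int main(void)
{
        struct toy_seq s = { .len = 0, .overflowed = false };

        toy_seq_puts(&s, " 0)   1.234 us  |  func();\n");
        toy_seq_puts(&s, " 0)            |  caller() {\n");
        printf("%s(overflow: %d)\n", s.buf, toy_handle_return(&s));
        return 0;
}
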
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
- if (!trace_empty(&iter))
- trace_find_next_entry_inc(&iter);
- while (!trace_empty(&iter)) {
+
+ while (trace_find_next_entry_inc(&iter)) {
if (!cnt)
kdb_printf("---------------------------------\n");
cnt++;
- if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ if (!skip_lines) {
print_trace_line(&iter);
- if (!skip_lines)
trace_printk_seq(&iter.seq);
- else
+ } else {
skip_lines--;
+ }
+
if (KDB_FLAG(CMD_INTERRUPT))
goto out;
}
@@ -86,9 +88,12 @@ out:
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- for_each_tracing_cpu(cpu)
- if (iter.buffer_iter[cpu])
+ for_each_tracing_cpu(cpu) {
+ if (iter.buffer_iter[cpu]) {
ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ iter.buffer_iter[cpu] = NULL;
+ }
+ }
}
/*
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
static __init int kdb_ftrace_register(void)
{
- kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
- "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
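
The kdb hunk gives the dump iterator its own static buffer_iter[] array and, just as importantly, clears each slot after ring_buffer_read_finish(), so a second 'ftdump' does not walk stale iterator pointers left over from the previous invocation. The habit generalizes to any long-lived table of resources; a trivial user-space illustration in which malloc()/free() stand in for the ring-buffer iterators:

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4

/* Static table that survives between invocations, like the static
 * buffer_iter[] added to ftrace_dump_buf().  Clearing each slot right
 * after releasing it keeps a later call from touching stale pointers. */
static int *slots[NSLOTS];

static void use_and_release(void)
{
        for (int i = 0; i < NSLOTS; i++)
                slots[i] = malloc(sizeof(int));

        /* ... work with the slots ... */

        for (int i = 0; i < NSLOTS; i++) {
                if (slots[i]) {
                        free(slots[i]);
                        slots[i] = NULL;   /* mirrors iter.buffer_iter[cpu] = NULL */
                }
        }
}

int main(void)
{
        use_and_release();
        use_and_release();                 /* safe to run again */
        printf("done\n");
        return 0;
}
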
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
int i;
- seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
ftrace_event_name(&tk->tp.call));
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tk->tp.nr_args; i++)
seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
-
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ goto out;
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, " <- "))
- goto partial;
+ trace_seq_puts(s, " <- ");
if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
mmio_reset_data(tr);
}
-static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
{
- int ret = 0;
int i;
resource_size_t start, end;
const struct pci_driver *drv = pci_dev_driver(dev);
- /* XXX: incomplete checks for trace_seq_printf() return value */
- ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
- dev->bus->number, dev->devfn,
- dev->vendor, dev->device, dev->irq);
+ trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
/*
* XXX: is pci_resource_to_user() appropriate, since we are
* supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
*/
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
}
if (drv)
- ret += trace_seq_printf(s, " %s\n", drv->name);
+ trace_seq_printf(s, " %s\n", drv->name);
else
- ret += trace_seq_puts(s, " \n");
- return ret;
+ trace_seq_puts(s, " \n");
}
static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret = 1;
trace_assign_type(field, entry);
rw = &field->rw;
switch (rw->opcode) {
case MMIO_READ:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
(rw->value >> 0) & 0xff, rw->pc, 0);
break;
default:
- ret = trace_seq_puts(s, "rw what?\n");
+ trace_seq_puts(s, "rw what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
trace_assign_type(field, entry);
m = &field->map;
switch (m->opcode) {
case MMIO_PROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
- ret = trace_seq_puts(s, "map what?\n");
+ trace_seq_puts(s, "map what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bputs_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bprint_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_bprintf(s, field->fmt, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct print_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
-
+
trace_seq_putc(p, 0);
return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- int ret;
event = container_of(trace_event, struct ftrace_event_call, event);
entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- return 0;
+ return trace_handle_return(s);
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
- int ret;
-
- ret = trace_seq_printf(s, "%s: ", name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_vprintf(s, fmt, ap);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", name);
+ trace_seq_vprintf(s, fmt, ap);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
}
#endif /* CONFIG_KRETPROBES */
-static int
+static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
-static int
+static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
sprint_symbol(str, address);
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (file) {
ret = trace_seq_path(s, &file->f_path);
if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]",
- ip - vmstart);
+ trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
}
up_read(&mm->mmap_sem);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return !trace_seq_has_overflowed(s);
}
int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
unsigned long sym_flags)
{
struct mm_struct *mm = NULL;
- int ret = 1;
unsigned int i;
if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
unsigned long ip = entry->caller[i];
- if (ip == ULONG_MAX || !ret)
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
break;
- if (ret)
- ret = trace_seq_puts(s, " => ");
+
+ trace_seq_puts(s, " => ");
+
if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- if (ret)
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
continue;
}
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- ret = trace_seq_putc(s, '\n');
+
+ seq_print_user_ip(s, mm, ip, sym_flags);
+ trace_seq_putc(s, '\n');
}
if (mm)
mmput(mm);
- return ret;
+
+ return !trace_seq_has_overflowed(s);
}
int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
- int ret;
-
- if (!ip)
- return trace_seq_putc(s, '0');
+ if (!ip) {
+ trace_seq_putc(s, '0');
+ goto out;
+ }
if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
+ seq_print_sym_offset(s, "%s", ip);
else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
+ seq_print_sym_short(s, "%s", ip);
if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+
+ out:
+ return !trace_seq_has_overflowed(s);
}
/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
- int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.';
- if (!trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq))
- return 0;
+ trace_seq_printf(s, "%c%c%c",
+ irqs_off, need_resched, hardsoft_irq);
if (entry->preempt_count)
- ret = trace_seq_printf(s, "%x", entry->preempt_count);
+ trace_seq_printf(s, "%x", entry->preempt_count);
else
- ret = trace_seq_putc(s, '.');
+ trace_seq_putc(s, '.');
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
- comm, entry->pid, cpu))
- return 0;
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
}
-static unsigned long preempt_mark_thresh_us = 100;
+#undef MARK
+#define MARK(v, s) {.val = v, .sym = s}
+/* trace overhead mark */
+static const struct trace_mark {
+ unsigned long long val; /* unit: nsec */
+ char sym;
+} mark[] = {
+ MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(1000000ULL , '#'), /* 1000 usecs */
+ MARK(100000ULL , '!'), /* 100 usecs */
+ MARK(10000ULL , '+'), /* 10 usecs */
+};
+#undef MARK
+
+char trace_find_mark(unsigned long long d)
+{
+ int i;
+ int size = ARRAY_SIZE(mark);
+
+ for (i = 0; i < size; i++) {
+ if (d >= mark[i].val)
+ break;
+ }
+
+ return (i == size) ? ' ' : mark[i].sym;
+}
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
unsigned long rel_msec = (unsigned long)rel_ts;
- return trace_seq_printf(
- s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
- ns2usecs(iter->ts),
- abs_msec, abs_usec,
- rel_msec, rel_usec);
+ trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+
} else if (verbose && !in_ns) {
- return trace_seq_printf(
- s, "[%016llx] %lld (+%lld): ",
- iter->ts, abs_ts, rel_ts);
+ trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+
} else if (!verbose && in_ns) {
- return trace_seq_printf(
- s, " %4lldus%c: ",
- abs_ts,
- rel_ts > preempt_mark_thresh_us ? '!' :
- rel_ts > 1 ? '+' : ' ');
+ trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ trace_find_mark(rel_ts * NSEC_PER_USEC));
+
} else { /* !verbose && !in_ns */
- return trace_seq_printf(s, " %4lld: ", abs_ts);
+ trace_seq_printf(s, " %4lld: ", abs_ts);
}
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
- int ret;
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (!ret)
- return 0;
- if (trace_flags & TRACE_ITER_IRQ_INFO) {
- ret = trace_print_lat_fmt(s, entry);
- if (!ret)
- return 0;
- }
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
t = ns2usecs(iter->ts);
usec_rem = do_div(t, USEC_PER_SEC);
secs = (unsigned long)t;
- return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
} else
- return trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_seq_printf(s, " %12llu: ", iter->ts);
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
- int ret;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
- comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
} else {
- ret = lat_print_generic(s, entry, iter->cpu);
+ lat_print_generic(s, entry, iter->cpu);
}
- if (ret)
- ret = lat_print_timestamp(iter, next_ts);
+ lat_print_timestamp(iter, next_ts);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
goto out;
} else {
-
+
event->type = next_event_type++;
list = &ftrace_event_list;
}
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- if (!trace_seq_puts(s, " <-"))
- goto partial;
- if (!seq_print_ip_sym(s,
- field->parent_ip,
- flags))
- goto partial;
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, field->parent_ip, flags);
}
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
- field->ip,
- field->parent_ip))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_HEX_FIELD(s, field->ip);
+ SEQ_PUT_HEX_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_FIELD(s, field->ip);
+ SEQ_PUT_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
T = task_state_char(field->next_state);
S = task_state_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
- if (!trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, delim,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
if (!S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
+ SEQ_PUT_HEX_FIELD(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD(s, S);
+ SEQ_PUT_HEX_FIELD(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD(s, T);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
+ SEQ_PUT_FIELD(s, field->prev_pid);
+ SEQ_PUT_FIELD(s, field->prev_prio);
+ SEQ_PUT_FIELD(s, field->prev_state);
+ SEQ_PUT_FIELD(s, field->next_cpu);
+ SEQ_PUT_FIELD(s, field->next_pid);
+ SEQ_PUT_FIELD(s, field->next_prio);
+ SEQ_PUT_FIELD(s, field->next_state);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
- if (!trace_seq_puts(s, "<stack trace>\n"))
- goto partial;
+ trace_seq_puts(s, "<stack trace>\n");
for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
- if (!trace_seq_puts(s, " => "))
- goto partial;
- if (!seq_print_ip_sym(s, *p, flags))
- goto partial;
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- }
+ if (trace_seq_has_overflowed(s))
+ break;
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " => ");
+ seq_print_ip_sym(s, *p, flags);
+ trace_seq_putc(s, '\n');
+ }
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!trace_seq_puts(s, "<user stack trace>\n"))
- goto partial;
-
- if (!seq_print_userip_objs(field, s, flags))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, "<user stack trace>\n");
+ seq_print_userip_objs(field, s, flags);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_puts(s, field->str);
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_printf(s, ": %s", field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_printf(s, ": %s", field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(&iter->seq);
}
static struct trace_event_functions trace_print_funcs = {
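Note on the pattern used throughout the hunks above: the trace_seq_* writers no longer return a per-call status, so each print handler simply emits everything and lets trace_handle_return() map the sequence's overflow state to TRACE_TYPE_HANDLED or TRACE_TYPE_PARTIAL_LINE. A minimal stand-alone sketch of that idiom follows; toy_seq, toy_puts() and toy_handle_return() are illustrative stand-ins, not the kernel API.

#include <stdio.h>
#include <string.h>

enum print_line_t { TOY_HANDLED, TOY_PARTIAL_LINE };

struct toy_seq {
	char buf[32];
	size_t len;
	int overflowed;
};

static void toy_puts(struct toy_seq *s, const char *str)
{
	size_t n = strlen(str);

	/* Once overflowed, every later write is silently dropped. */
	if (s->overflowed || s->len + n > sizeof(s->buf)) {
		s->overflowed = 1;
		return;
	}
	memcpy(s->buf + s->len, str, n);
	s->len += n;
}

static enum print_line_t toy_handle_return(struct toy_seq *s)
{
	return s->overflowed ? TOY_PARTIAL_LINE : TOY_HANDLED;
}

static enum print_line_t print_event(struct toy_seq *s)
{
	/* No per-call error checks; one overflow test at the end. */
	toy_puts(s, "<stack trace>\n");
	toy_puts(s, " => some_function\n");
	return toy_handle_return(s);
}

int main(void)
{
	struct toy_seq s = { .len = 0, .overflowed = 0 };

	printf("%s\n", print_event(&s) == TOY_HANDLED ? "handled" : "partial");
	return 0;
}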
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
+#define SEQ_PUT_FIELD(s, x) \
+ trace_seq_putmem(s, &(x), sizeof(x))
+
+#define SEQ_PUT_HEX_FIELD(s, x) \
+ trace_seq_putmem_hex(s, &(x), sizeof(x))
#endif
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
seq_puts(m, "\\t");
break;
case '\\':
- seq_puts(m, "\\");
+ seq_putc(m, '\\');
break;
case '"':
seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
- return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ return !trace_seq_has_overflowed(s); \
} \
const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
int len = *(u32 *)data >> 16;
if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
+ trace_seq_printf(s, " %s=(fault)", name);
else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
+ trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+ return !trace_seq_has_overflowed(s);
}
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
#include "trace.h"
-static struct trace_array *ctx_trace;
-static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
-static int sched_stopped;
-
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
static void
probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
-
- local_irq_restore(flags);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu, pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(current);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_wakeup_trace(ctx_trace, wakee, current,
- flags, pc);
-
- local_irq_restore(flags);
}
static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
{
tracing_stop_sched_switch();
}
-
-/**
- * tracing_start_sched_switch_record - start tracing context switches
- *
- * Turns on context switch tracing for a tracer.
- */
-void tracing_start_sched_switch_record(void)
-{
- if (unlikely(!ctx_trace)) {
- WARN_ON(1);
- return;
- }
-
- tracing_start_sched_switch();
-
- mutex_lock(&sched_register_mutex);
- tracer_enabled++;
- mutex_unlock(&sched_register_mutex);
-}
-
-/**
- * tracing_stop_sched_switch_record - start tracing context switches
- *
- * Turns off context switch tracing for a tracer.
- */
-void tracing_stop_sched_switch_record(void)
-{
- mutex_lock(&sched_register_mutex);
- tracer_enabled--;
- WARN_ON(tracer_enabled < 0);
- mutex_unlock(&sched_register_mutex);
-
- tracing_stop_sched_switch();
-}
-
-/**
- * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
- * @tr: trace array pointer to assign
- *
- * Some tracers might want to record the context switches in their
- * trace. This function lets those tracers assign the trace array
- * to use.
- */
-void tracing_sched_switch_assign_trace(struct trace_array *tr)
-{
- ctx_trace = tr;
-}
-
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
wakeup_current_cpu = cpu;
}
+static void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
static void notrace
probe_wakeup_sched_switch(void *ignore,
struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
tracing_start();
/* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
trace->reset(tr);
tracing_start();
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#endif
#ifdef CONFIG_SCHED_TRACER
+
+struct wakeup_test_data {
+ struct completion is_ready;
+ int go;
+};
+
static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
};
- struct completion *x = data;
+ struct wakeup_test_data *x = data;
sched_setattr(current, &attr);
/* Make it know we have a new prio */
- complete(x);
+ complete(&x->is_ready);
/* now go to sleep and let the test wake us up */
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!x->go) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
- complete(x);
+ complete(&x->is_ready);
+
+ set_current_state(TASK_INTERRUPTIBLE);
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
- /*
- * This will likely be the system top priority
- * task, do short sleeps to let others run.
- */
- msleep(100);
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
+
return 0;
}
-
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tr->max_latency;
struct task_struct *p;
- struct completion is_ready;
+ struct wakeup_test_data data;
unsigned long count;
int ret;
- init_completion(&is_ready);
+ memset(&data, 0, sizeof(data));
+
+ init_completion(&data.is_ready);
/* create a -deadline thread */
- p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
+ p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}
/* make sure the thread is running at -deadline policy */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* start the tracing */
ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
msleep(100);
}
- init_completion(&is_ready);
+ init_completion(&data.is_ready);
+
+ data.go = 1;
+ /* memory barrier is in the wake_up_process() */
wake_up_process(p);
/* Wait for the task to wake up */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(&tr->trace_buffer, NULL);
- printk("ret = %d\n", ret);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
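The reworked wakeup self-test above replaces the bare completion with a wakeup_test_data carrying both the completion and a go flag, and the kthread re-checks the flag after every wakeup so a spurious wakeup cannot end the wait early. The same handshake is commonly written in userspace with a condition variable; the sketch below is only an analogue of that pattern under that assumption, not the kernel code.

#include <pthread.h>
#include <stdio.h>

struct wakeup_test_data {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int ready;
	int go;
};

static void *test_thread(void *arg)
{
	struct wakeup_test_data *x = arg;

	pthread_mutex_lock(&x->lock);
	x->ready = 1;			/* analogue of complete(&x->is_ready) */
	pthread_cond_signal(&x->cond);
	while (!x->go)			/* re-check the flag after every wakeup */
		pthread_cond_wait(&x->cond, &x->lock);
	pthread_mutex_unlock(&x->lock);
	return NULL;
}

int main(void)
{
	struct wakeup_test_data d = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	pthread_create(&t, NULL, test_thread, &d);

	pthread_mutex_lock(&d.lock);
	while (!d.ready)		/* wait_for_completion() analogue */
		pthread_cond_wait(&d.cond, &d.lock);
	d.go = 1;			/* ordering is provided by the mutex */
	pthread_cond_signal(&d.cond);
	pthread_mutex_unlock(&d.lock);

	pthread_join(t, NULL);
	puts("woken and joined");
	return 0;
}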
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
#include <linux/trace_seq.h>
/* How much buffer is left on the trace_seq? */
-#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
/* How much buffer is written? */
-#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
+
+/*
+ * A trace_seq must also work when it has simply been initialized to zeros.
+ */
+static inline void __trace_seq_init(struct trace_seq *s)
+{
+ if (unlikely(!s->seq.size))
+ trace_seq_init(s);
+}
/**
* trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
*/
int trace_print_seq(struct seq_file *m, struct trace_seq *s)
{
- unsigned int len = TRACE_SEQ_BUF_USED(s);
int ret;
- ret = seq_write(m, s->buffer, len);
+ __trace_seq_init(s);
+
+ ret = seq_buf_print_seq(m, &s->seq);
/*
* Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
* trace_seq_printf() is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ unsigned int save_len = s->seq.len;
va_list ap;
- int ret;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ seq_buf_vprintf(&s->seq, fmt, ap);
va_end(ap);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
* @nmaskbits: The number of bits that are valid in @maskp
*
* Writes an ASCII representation of a bitmask string into @s.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
int nmaskbits)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
+ __trace_seq_init(s);
- return 1;
+ seq_buf_bitmask(&s->seq, maskp, nmaskbits);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
}
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_printf is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+ __trace_seq_init(s);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_vprintf);
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
*
* This function will take the format and the binary array and finish
* the conversion into the ASCII string within the buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+ seq_buf_bprintf(&s->seq, fmt, binary);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
+ return;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_bprintf);
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
* copy to user routines. This function records a simple string
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_puts(struct trace_seq *s, const char *str)
+void trace_seq_puts(struct trace_seq *s, const char *str)
{
unsigned int len = strlen(str);
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, str, len);
}
EXPORT_SYMBOL_GPL(trace_seq_puts);
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
* copy to user routines. This function records a simple character
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
+void trace_seq_putc(struct trace_seq *s, unsigned char c)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
- return 0;
+ return;
}
- s->buffer[s->len++] = c;
-
- return 1;
+ seq_buf_putc(&s->seq, c);
}
EXPORT_SYMBOL_GPL(trace_seq_putc);
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
* There may be cases where raw memory needs to be written into the
* buffer and a strcpy() would not work. Using this function allows
* for such cases.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, mem, len);
}
EXPORT_SYMBOL_GPL(trace_seq_putmem);
-#define MAX_MEMHEX_BYTES 8U
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
/**
* trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
* @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
* This is similar to trace_seq_putmem() except instead of just copying the
* raw memory into the buffer it writes its ASCII representation of it
* in hex characters.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
unsigned int len)
{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- unsigned int start_len;
- int i, j;
- int cnt = 0;
+ unsigned int save_len = s->seq.len;
if (s->full)
- return 0;
+ return;
- while (len) {
- start_len = min(len, HEX_CHARS - 1);
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < start_len; i++) {
-#else
- for (i = start_len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- if (WARN_ON_ONCE(j == 0 || j/2 > len))
- break;
-
- /* j increments twice per loop */
- len -= j / 2;
- hex[j++] = ' ';
-
- cnt += trace_seq_putmem(s, hex, j);
+ __trace_seq_init(s);
+
+ /* Each byte is represented by two chars */
+ if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ /* The added spaces can still cause an overflow */
+ seq_buf_putmem_hex(&s->seq, mem, len);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
}
- return cnt;
}
EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
*/
int trace_seq_path(struct trace_seq *s, const struct path *path)
{
- unsigned char *p;
+ unsigned int save_len = s->seq.len;
if (s->full)
return 0;
+ __trace_seq_init(s);
+
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
return 0;
}
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
+ seq_buf_path(&s->seq, path, "\n");
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
}
- s->full = 1;
- return 0;
+ return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_path);
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
+ __trace_seq_init(s);
+ return seq_buf_to_user(&s->seq, ubuf, cnt);
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
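Throughout this file the trace_seq now delegates to the embedded seq_buf and follows an all-or-nothing discipline: record the current length, attempt the write, and roll the length back while setting ->full if the seq_buf reports an overflow. A small stand-alone sketch of that save/roll-back idiom, using a hypothetical toy_seq rather than the kernel types:

#include <stdarg.h>
#include <stdio.h>

struct toy_seq {
	char buf[32];
	size_t len;
	int full;
};

/* Either the whole formatted string fits, or nothing is kept at all. */
static void toy_seq_printf(struct toy_seq *s, const char *fmt, ...)
{
	size_t save_len = s->len;
	va_list ap;
	int ret;

	if (s->full)
		return;

	va_start(ap, fmt);
	ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	/* vsnprintf reports truncation by returning the would-be length. */
	if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len) {
		s->len = save_len;	/* roll back the partial write */
		s->full = 1;
		return;
	}
	s->len += ret;
}

int main(void)
{
	struct toy_seq s = { .len = 0, .full = 0 };

	toy_seq_printf(&s, "pid=%d", 1234);
	toy_seq_printf(&s, " and a line that is far too long to ever fit here");
	printf("kept \"%.*s\" full=%d\n", (int)s.len, s.buf, s.full);
	return 0;
}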
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/magic.h>
#include <asm/setup.h>
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- if ((current != &init_task &&
- *(end_of_stack(current)) != STACK_END_MAGIC)) {
+ if (task_stack_end_corrupted(current)) {
print_max_stack();
BUG();
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..c6ee36fcbf90 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, ret, syscall;
+ int i, syscall;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
- ret = trace_seq_printf(s, "%s(", entry->name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE) {
- ret = trace_seq_printf(s, "%s ", entry->types[i]);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", entry->types[i]);
+
/* parameter values */
- ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
}
- ret = trace_seq_putc(s, ')');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
+ trace_seq_putc(s, ')');
end:
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
- int ret;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
if (!entry) {
trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ goto out;
}
if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return TRACE_TYPE_UNHANDLED;
}
- ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ out:
+ return trace_handle_return(s);
}
extern char *__bad_type_size(void);
@@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_enter--;
- rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
@@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-static int __init init_ftrace_syscalls(void)
+void __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return -ENOMEM;
+ return;
}
for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
meta->syscall_nr = i;
syscalls_metadata[i] = meta;
}
-
- return 0;
}
-early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
@@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
@@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
return ret;
fail_address_parse:
- if (inode)
- iput(inode);
+ iput(inode);
pr_info("Failed to parse address or file.\n");
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[1], entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
struct probe_arg *parg = &tu->tp.args[i];
if (!parg->type->print(s, parg->name, data + parg->offset, entry))
- goto partial;
+ goto out;
}
- if (trace_seq_puts(s, "\n"))
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 394f70b17162..9586b670a5b2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
void user_return_notifier_register(struct user_return_notifier *urn)
{
set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
- hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
+ hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
}
EXPORT_SYMBOL_GPL(user_return_notifier_register);
@@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
void user_return_notifier_unregister(struct user_return_notifier *urn)
{
hlist_del(&urn->link);
- if (hlist_empty(&__get_cpu_var(return_notifier_list)))
+ if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
}
EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
.count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .proc_inum = PROC_USER_INIT_INO,
+ .ns.inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_USER_NS
+ .ns.ops = &userns_operations,
+#endif
+ .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
- ret = proc_alloc_inum(&ns->proc_inum);
+ ret = ns_alloc_inum(&ns->ns);
if (ret) {
kmem_cache_free(user_ns_cachep, ns);
return ret;
}
+ ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
set_cred_user_ns(new, ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-
-static DEFINE_MUTEX(id_map_mutex);
-
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
/*
- * The id_map_mutex serializes all writes to any given map.
+ * The userns_state_mutex serializes all writes to any given map.
*
* Any map is only ever written once.
*
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* order and smp_rmb() is guaranteed that we don't have crazy
* architectures returning stale data.
*/
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
ret = -EPERM;
/* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!page)
goto out;
- /* Only allow <= page size writes at the beginning of the file */
+ /* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*ppos = count;
ret = count;
out:
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
if (page)
free_page(page);
return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
- /* Allow mapping to your own filesystem ids */
- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ const struct cred *cred = file->f_cred;
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->fsuid))
+ if (uid_eq(uid, cred->euid))
return true;
} else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, file->f_cred->fsgid))
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
return true;
}
}
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
-static void *userns_get(struct task_struct *task)
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+ /* Verify there is no trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&userns_state_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
+ mutex_unlock(&userns_state_mutex);
+
+ return allowed;
+}
+
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
+static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
user_ns = get_user_ns(__task_cred(task)->user_ns);
rcu_read_unlock();
- return user_ns;
+ return user_ns ? &user_ns->ns : NULL;
}
-static void userns_put(void *ns)
+static void userns_put(struct ns_common *ns)
{
- put_user_ns(ns);
+ put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, void *ns)
+static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
- struct user_namespace *user_ns = ns;
+ struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
/* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
return commit_creds(cred);
}
-static unsigned int userns_inum(void *ns)
-{
- struct user_namespace *user_ns = ns;
- return user_ns->proc_inum;
-}
-
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
- .inum = userns_inum,
};
static __init int user_namespaces_init(void)
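The setgroups knob added above is visible to userspace: with this series applied, an unprivileged process that wants to write gid_map in a new user namespace is expected to write "deny" to /proc/[pid]/setgroups first (and cannot re-enable it afterwards). Below is a hedged userspace sketch of that sequence, assuming a kernel with unprivileged user namespaces enabled; it is an illustration of the interface, not part of the patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];
	unsigned int uid = getuid();
	unsigned int gid = getgid();

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}

	/* Must happen before gid_map is written; it is what makes the
	 * unprivileged gid_map write below acceptable to
	 * new_idmap_permitted(). */
	write_file("/proc/self/setgroups", "deny");

	snprintf(map, sizeof(map), "0 %u 1", uid);
	write_file("/proc/self/uid_map", map);

	snprintf(map, sizeof(map), "0 %u 1", gid);
	write_file("/proc/self/gid_map", map);

	printf("inside the new userns: uid=%u gid=%u\n",
	       (unsigned int)getuid(), (unsigned int)getgid());
	return 0;
}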
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
if (!ns)
return ERR_PTR(-ENOMEM);
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err) {
kfree(ns);
return ERR_PTR(err);
}
+ ns->ns.ops = &utsns_operations;
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kfree(ns);
}
-static void *utsns_get(struct task_struct *task)
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static struct ns_common *utsns_get(struct task_struct *task)
{
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
}
task_unlock(task);
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void utsns_put(void *ns)
+static void utsns_put(struct ns_common *ns)
{
- put_uts_ns(ns);
+ put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, void *new)
+static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
- struct uts_namespace *ns = new;
+ struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
return 0;
}
-static unsigned int utsns_inum(void *vp)
-{
- struct uts_namespace *ns = vp;
-
- return ns->proc_inum;
-}
-
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
- .inum = utsns_inum,
};
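Both namespace conversions above follow the same shape: the proc_ns_operations callbacks now traffic in a struct ns_common embedded in the specific namespace, a to_*_ns() helper recovers the outer structure with container_of(), and the shared inode number lives in ns_common instead of a per-type proc_inum field. A minimal userspace illustration of that embedding follows; demo_ns and ns_common_like are made-up names, not kernel types.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ns_common_like {
	unsigned int inum;
};

struct demo_ns {
	const char *name;
	struct ns_common_like ns;	/* embedded common header */
};

static struct demo_ns *to_demo_ns(struct ns_common_like *ns)
{
	return container_of(ns, struct demo_ns, ns);
}

int main(void)
{
	struct demo_ns d = { .name = "uts", .ns = { .inum = 0xeffffffe } };
	struct ns_common_like *common = &d.ns;	/* what ->get() would hand out */

	printf("%s inum=%#x\n", to_demo_ns(common)->name,
	       to_demo_ns(common)->ns.inum);
	return 0;
}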
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..70bf11815f84 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/lockdep.h>
-#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
@@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn;
static int hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static bool hardlockup_detector_enabled = true;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void watchdog_enable_hardlockup_detector(bool val)
+{
+ hardlockup_detector_enabled = val;
+}
+
+bool watchdog_hardlockup_detector_is_enabled(void)
+{
+ return hardlockup_detector_enabled;
+}
+
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
@@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str)
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
watchdog_user_enabled = 0;
+ else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
+ /*
+ * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
+ * has the same effect.
+ */
+ watchdog_user_enabled = 1;
+ watchdog_enable_hardlockup_detector(true);
+ }
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -185,7 +208,7 @@ void touch_nmi_watchdog(void)
* case we shouldn't have to worry about the watchdog
* going off.
*/
- __raw_get_cpu_var(watchdog_nmi_touch) = true;
+ raw_cpu_write(watchdog_nmi_touch, true);
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
void touch_softlockup_watchdog_sync(void)
{
- __raw_get_cpu_var(softlockup_touch_sync) = true;
- __raw_get_cpu_var(watchdog_touch_ts) = 0;
+ __this_cpu_write(softlockup_touch_sync, true);
+ __this_cpu_write(watchdog_touch_ts, 0);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -333,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
return HRTIMER_RESTART;
+ }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +387,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
@@ -387,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -407,7 +445,7 @@ static void watchdog_enable(unsigned int cpu)
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);
@@ -454,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu)
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ /*
+ * Some kernels need to default hard lockup detection to
+ * 'disabled', for example a guest on a hypervisor.
+ */
+ if (!watchdog_hardlockup_detector_is_enabled()) {
+ event = ERR_PTR(-ENOENT);
+ goto handle_err;
+ }
+
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
goto out;
@@ -468,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+handle_err:
/* save cpu0 error for future comparison */
if (cpu == 0 && IS_ERR(event))
cpu0_err = PTR_ERR(event);
@@ -514,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu)
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
}
- return;
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -534,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = {
static void restart_watchdog_hrtimer(void *info)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
int ret;
/*
@@ -610,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int err, old_thresh, old_enabled;
+ bool old_hardlockup;
static DEFINE_MUTEX(watchdog_proc_mutex);
mutex_lock(&watchdog_proc_mutex);
old_thresh = ACCESS_ONCE(watchdog_thresh);
old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+ old_hardlockup = watchdog_hardlockup_detector_is_enabled();
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (err || !write)
@@ -626,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write,
* disabled. The 'watchdog_running' variable check in
* watchdog_*_all_cpus() function takes care of this.
*/
- if (watchdog_user_enabled && watchdog_thresh)
+ if (watchdog_user_enabled && watchdog_thresh) {
+ /*
+ * Prevent a change in watchdog_thresh accidentally overriding
+ * the enablement of the hardlockup detector.
+ */
+ if (watchdog_user_enabled != old_enabled)
+ watchdog_enable_hardlockup_detector(true);
err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
- else
+ } else
watchdog_disable_all_cpus();
/* Restore old values on failure */
if (err) {
watchdog_thresh = old_thresh;
watchdog_user_enabled = old_enabled;
+ watchdog_enable_hardlockup_detector(old_hardlockup);
}
out:
mutex_unlock(&watchdog_proc_mutex);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5dbe22aa3efd..6202b08f1933 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
+ spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock(&pool->lock);
- spin_unlock_irq(&wq_mayday_lock);
+ spin_unlock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2043,9 +2043,10 @@ __acquires(&pool->lock)
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
- * stop_machine.
+ * stop_machine. At the same time, report a quiescent RCU state so
+ * the same condition doesn't freeze RCU.
*/
- cond_resched();
+ cond_resched_rcu_qs();
spin_lock_irq(&pool->lock);
@@ -2247,12 +2248,30 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+ WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
- process_scheduled_works(rescuer);
+ if (!list_empty(scheduled)) {
+ process_scheduled_works(rescuer);
+
+ /*
+ * The above execution of rescued work items could
+ * have created more to rescue through
+ * pwq_activate_first_delayed() or chained
+ * queueing. Let's put @pwq back on the mayday list so
+ * that such back-to-back work items, which may be
+ * being used to relieve memory pressure, don't
+ * incur a MAYDAY_INTERVAL delay in between.
+ */
+ if (need_to_create_worker(pool)) {
+ spin_lock(&wq_mayday_lock);
+ get_pwq(pwq);
+ list_move_tail(&pwq->mayday_node, &wq->maydays);
+ spin_unlock(&wq_mayday_lock);
+ }
+ }
/*
* Put the reference grabbed by send_mayday(). @pool won't