summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup.c28
-rw-r--r--kernel/cpu.c44
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/dma-coherent.c47
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c37
-rw-r--r--kernel/hrtimer.c45
-rw-r--r--kernel/hung_task.c206
-rw-r--r--kernel/irq/chip.c11
-rw-r--r--kernel/irq/handle.c69
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/manage.c22
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c19
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/spurious.c14
-rw-r--r--kernel/kallsyms.c16
-rw-r--r--kernel/kgdb.c6
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/module.c35
-rw-r--r--kernel/mutex-debug.c9
-rw-r--r--kernel/mutex-debug.h18
-rw-r--r--kernel/mutex.c121
-rw-r--r--kernel/mutex.h22
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/posix-cpu-timers.c70
-rw-r--r--kernel/power/disk.c10
-rw-r--r--kernel/power/main.c26
-rw-r--r--kernel/rcuclassic.c2
-rw-r--r--kernel/rcutree.c2
-rw-r--r--kernel/relay.c8
-rw-r--r--kernel/sched.c217
-rw-r--r--kernel/sched_cpupri.c5
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c91
-rw-r--r--kernel/sched_features.h4
-rw-r--r--kernel/sched_rt.c588
-rw-r--r--kernel/sched_stats.h33
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/slow-work.c640
-rw-r--r--kernel/smp.c36
-rw-r--r--kernel/softirq.c16
-rw-r--r--kernel/softlockup.c109
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/time/clockevents.c20
-rw-r--r--kernel/time/tick-common.c26
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/trace/Kconfig55
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c45
-rw-r--r--kernel/trace/kmemtrace.c350
-rw-r--r--kernel/trace/ring_buffer.c70
-rw-r--r--kernel/trace/trace.c859
-rw-r--r--kernel/trace/trace.h60
-rw-r--r--kernel/trace/trace_boot.c1
-rw-r--r--kernel/trace/trace_branch.c278
-rw-r--r--kernel/trace/trace_functions.c193
-rw-r--r--kernel/trace/trace_functions_graph.c8
-rw-r--r--kernel/trace/trace_hw_branches.c174
-rw-r--r--kernel/trace/trace_irqsoff.c1
-rw-r--r--kernel/trace/trace_mmiotrace.c31
-rw-r--r--kernel/trace/trace_output.c829
-rw-r--r--kernel/trace/trace_output.h60
-rw-r--r--kernel/trace/trace_power.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c79
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stat.c319
-rw-r--r--kernel/trace/trace_stat.h31
-rw-r--r--kernel/trace/trace_workqueue.c281
-rw-r--r--kernel/workqueue.c36
72 files changed, 4816 insertions, 1724 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b6..57babbbc65ee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -92,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..5a54ff42874e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) {
}
write_unlock(&css_set_lock);
- list_del(&root->root_list);
- root_count--;
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ root_count--;
+ }
mutex_unlock(&cgroup_mutex);
@@ -2434,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err_remove:
+ cgroup_lock_hierarchy(root);
list_del(&cgrp->sibling);
+ cgroup_unlock_hierarchy(root);
root->number_of_cgroups--;
err_destroy:
@@ -2507,7 +2511,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
int refcnt;
- do {
+ while (1) {
/* We can only remove a CSS with a refcnt==1 */
refcnt = atomic_read(&css->refcnt);
if (refcnt > 1) {
@@ -2521,7 +2525,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
* css_tryget() to spin until we set the
* CSS_REMOVED bits or abort
*/
- } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+ if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+ break;
+ cpu_relax();
+ }
}
done:
for_each_subsys(cgrp->root, ss) {
@@ -2991,20 +2998,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&cgroup_mutex);
return 0;
}
- task_lock(tsk);
- cg = tsk->cgroups;
- parent = task_cgroup(tsk, subsys->subsys_id);
/* Pin the hierarchy */
- if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+ if (!atomic_inc_not_zero(&root->sb->s_active)) {
/* We race with the final deactivate_super() */
mutex_unlock(&cgroup_mutex);
return 0;
}
/* Keep the cgroup alive */
+ task_lock(tsk);
+ parent = task_cgroup(tsk, subsys->subsys_id);
+ cg = tsk->cgroups;
get_css_set(cg);
task_unlock(tsk);
+
mutex_unlock(&cgroup_mutex);
/* Now do the VFS work to create a cgroup */
@@ -3043,7 +3051,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&inode->i_mutex);
put_css_set(cg);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
/* The cgroup is still accessible in the VFS, but
* we're not going to try to rmdir() it at this
* point. */
@@ -3069,7 +3077,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_lock(&cgroup_mutex);
put_css_set(cg);
mutex_unlock(&cgroup_mutex);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
return ret;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb8..a9033184bfff 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,69 +499,65 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
+DECLARE_BITMAP(__cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
= CPU_BITS_ALL;
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+DECLARE_BITMAP(__cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_bits);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+DECLARE_BITMAP(__cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
+EXPORT_SYMBOL(__cpu_online_bits);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+DECLARE_BITMAP(__cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
+EXPORT_SYMBOL(__cpu_present_bits);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
+DECLARE_BITMAP(__cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
+EXPORT_SYMBOL(__cpu_active_bits);
void set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
+ cpumask_set_cpu(cpu, to_cpumask(__cpu_possible_bits));
else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+ cpumask_clear_cpu(cpu, to_cpumask(__cpu_possible_bits));
}
void set_cpu_present(unsigned int cpu, bool present)
{
if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
+ cpumask_set_cpu(cpu, to_cpumask(__cpu_present_bits));
else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+ cpumask_clear_cpu(cpu, to_cpumask(__cpu_present_bits));
}
void set_cpu_online(unsigned int cpu, bool online)
{
if (online)
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+ cpumask_set_cpu(cpu, to_cpumask(__cpu_online_bits));
else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ cpumask_clear_cpu(cpu, to_cpumask(__cpu_online_bits));
}
void set_cpu_active(unsigned int cpu, bool active)
{
if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ cpumask_set_cpu(cpu, to_cpumask(__cpu_active_bits));
else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+ cpumask_clear_cpu(cpu, to_cpumask(__cpu_active_bits));
}
void init_cpu_present(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
+ cpumask_copy(to_cpumask(__cpu_present_bits), src);
}
void init_cpu_possible(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+ cpumask_copy(to_cpumask(__cpu_possible_bits), src);
}
void init_cpu_online(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ cpumask_copy(to_cpumask(__cpu_online_bits), src);
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a85678865c5e..f76db9dcaa05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
#include <linux/cgroup.h>
/*
+ * Workqueue for cpuset related tasks.
+ *
+ * Using kevent workqueue may cause deadlock when memory_migrate
+ * is set. So we create a separate workqueue thread for cpuset.
+ */
+static struct workqueue_struct *cpuset_wq;
+
+/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
*/
static void async_rebuild_sched_domains(void)
{
- schedule_work(&rebuild_sched_domains_work);
+ queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
/*
@@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void)
hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+
+ cpuset_wq = create_singlethread_workqueue("cpuset");
+ BUG_ON(!cpuset_wq);
}
/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b76..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
* @size: size of requested memory area
* @dma_handle: This will be filled with the correct dma handle
* @ret: This pointer will be filled with the virtual address
- * to allocated area.
+ * to allocated area.
*
* This function should be only called from per-arch dma_alloc_coherent()
* to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
mem = dev->dma_mem;
if (!mem)
return 0;
- if (unlikely(size > mem->size))
- return 0;
+
+ *ret = NULL;
+
+ if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ goto err;
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
- if (pageno >= 0) {
- /*
- * Memory was found in the per-device arena.
- */
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- *ret = mem->virt_base + (pageno << PAGE_SHIFT);
- memset(*ret, 0, size);
- } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
- /*
- * The per-device arena is exhausted and we are not
- * permitted to fall back to generic memory.
- */
- *ret = NULL;
- } else {
- /*
- * The per-device arena is exhausted and we are
- * permitted to fall back to generic memory.
- */
- return 0;
- }
+ if (unlikely(pageno < 0))
+ goto err;
+
+ /*
+ * Memory was found in the per-device area.
+ */
+ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ *ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ memset(*ret, 0, size);
+
return 1;
+
+err:
+ /*
+ * In the case where the allocation can not be satisfied from the
+ * per-device area, try to fall back to generic memory if the
+ * constraints allow it.
+ */
+ return mem->flags & DMA_MEMORY_EXCLUSIVE;
}
EXPORT_SYMBOL(dma_alloc_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..70612c19ac96 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -977,12 +977,9 @@ static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
- unsigned long *n = end_of_stack(current);
unsigned long free;
- while (*n == 0)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(current);
+ free = stack_not_used(current);
if (free >= lowest_to_date)
return;
diff --git a/kernel/fork.c b/kernel/fork.c
index bf0cef8bbdf2..28d7669f8965 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,6 +61,7 @@
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <trace/sched.h>
+#include <linux/magic.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -176,7 +177,7 @@ void __init fork_init(unsigned long mempages)
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
/* do the arch specific task caches init */
@@ -212,6 +213,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
+ unsigned long *stackend;
+
int err;
prepare_to_copy(orig);
@@ -237,6 +240,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto out;
setup_thread_stack(tsk, orig);
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
@@ -817,17 +822,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
- int ret;
if (clone_flags & CLONE_THREAD) {
- ret = thread_group_cputime_clone_thread(current);
- if (likely(!ret)) {
- atomic_inc(&current->signal->count);
- atomic_inc(&current->signal->live);
- }
- return ret;
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
+ return 0;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+
+ if (sig)
+ posix_cpu_timers_init_group(sig);
+
tsk->signal = sig;
if (!sig)
return -ENOMEM;
@@ -864,8 +869,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
- posix_cpu_timers_init_group(sig);
-
acct_init_pacct(&sig->pacct);
tty_audit_fork(sig);
@@ -1041,7 +1044,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->default_timer_slack_ns = current->timer_slack_ns;
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
#endif
@@ -1470,20 +1473,20 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
- sighand_ctor);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_NOTRACK, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
mmap_init();
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2dc30c59c5fd..f394d2a42ca3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ /*
+ * clock_was_set() has changed base->offset so the
+ * result might be negative. Fix it up to prevent a
+ * false positive in clockevents_program_event()
+ */
+ if (expires.tv64 < 0)
+ expires.tv64 = 0;
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
@@ -614,7 +621,9 @@ void clock_was_set(void)
*/
void hres_timers_resume(void)
{
- /* Retrigger the CPU local events: */
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hres_timers_resume() called with IRQs enabled!");
+
retrigger_next_event(NULL);
}
@@ -1156,6 +1165,29 @@ static void __run_hrtimer(struct hrtimer *timer)
#ifdef CONFIG_HIGH_RES_TIMERS
+static int force_clock_reprogram;
+
+/*
+ * After 5 iteration's attempts, we consider that hrtimer_interrupt()
+ * is hanging, which could happen with something that slows the interrupt
+ * such as the tracing. Then we force the clock reprogramming for each future
+ * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
+ * threshold that we will overwrite.
+ * The next tick event will be scheduled to 3 times we currently spend on
+ * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
+ * 1/4 of their time to process the hrtimer interrupts. This is enough to
+ * let it running without serious starvation.
+ */
+
+static inline void
+hrtimer_interrupt_hanging(struct clock_event_device *dev,
+ ktime_t try_time)
+{
+ force_clock_reprogram = 1;
+ dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+ printk(KERN_WARNING "hrtimer: interrupt too slow, "
+ "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+}
/*
* High resolution timer interrupt
* Called with interrupts disabled
@@ -1165,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
ktime_t expires_next, now;
+ int nr_retries = 0;
int i;
BUG_ON(!cpu_base->hres_active);
@@ -1172,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
dev->next_event.tv64 = KTIME_MAX;
retry:
+ /* 5 retries is enough to notice a hang */
+ if (!(++nr_retries % 5))
+ hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
+
now = ktime_get();
expires_next.tv64 = KTIME_MAX;
@@ -1224,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
/* Reprogramming necessary ? */
if (expires_next.tv64 != KTIME_MAX) {
- if (tick_program_event(expires_next, 0))
+ if (tick_program_event(expires_next, force_clock_reprogram))
goto retry;
}
}
@@ -1578,6 +1615,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
{
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..ba8ccd432963
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,206 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+static unsigned long __read_mostly hung_task_poll_jiffies;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+ sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ int this_cpu = raw_smp_processor_id();
+
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static void check_hung_task(struct task_struct *t, unsigned long now,
+ unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) < timeout)
+ return;
+ if (!sysctl_hung_task_warnings)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid, timeout);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+
+ if (sysctl_hung_task_panic)
+ panic("hung_task: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp();
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ goto unlock;
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now, timeout);
+ } while_each_thread(g, t);
+ unlock:
+ read_unlock(&tasklist_lock);
+}
+
+static void update_poll_jiffies(void)
+{
+ /* timeout of 0 will disable the watchdog */
+ if (sysctl_hung_task_timeout_secs == 0)
+ hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
+ else
+ hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto out;
+
+ update_poll_jiffies();
+
+ wake_up_process(watchdog_task);
+
+ out:
+ return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+ update_poll_jiffies();
+
+ for ( ; ; ) {
+ unsigned long timeout;
+
+ while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+
+ /*
+ * Need to cache timeout here to avoid timeout being set
+ * to 0 via sysctl while inside check_hung_*_tasks().
+ */
+ timeout = sysctl_hung_task_timeout_secs;
+ if (timeout)
+ check_hung_uninterruptible_tasks(timeout);
+ }
+
+ return 0;
+}
+
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7de11bd64dfe..e89adf3e69cf 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
- cpumask_setall(&desc->affinity);
+ cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_clear(desc->pending_mask);
+#endif
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -290,7 +293,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
desc->chip->mask_ack(irq);
else {
desc->chip->mask(irq);
- desc->chip->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
}
}
@@ -476,7 +480,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
- desc->chip->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
desc = irq_remap_to_desc(irq, desc);
/* Mark the IRQ currently in progress.*/
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48299a8a22f8..8b70e5ef71a4 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
+#include <linux/bootmem.h>
#include "internals.h"
@@ -39,6 +40,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
ack_bad_irq(irq);
}
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
+ cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template', that is used
@@ -57,6 +70,7 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
#ifdef CONFIG_SPARSE_IRQ
+
static struct irq_desc irq_desc_init = {
.irq = -1,
.status = IRQ_DISABLED,
@@ -64,9 +78,6 @@ static struct irq_desc irq_desc_init = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
};
void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -101,6 +112,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
printk(KERN_ERR "can not alloc kstat_irqs\n");
BUG_ON(1);
}
+ if (!init_alloc_desc_masks(desc, cpu, false)) {
+ printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+ BUG_ON(1);
+ }
arch_init_chip_data(desc, cpu);
}
@@ -109,7 +124,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
*/
DEFINE_SPINLOCK(sparse_irq_lock);
-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+struct irq_desc **irq_desc_ptrs __read_mostly;
static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS_LEGACY-1] = {
@@ -119,14 +134,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
}
};
-/* FIXME: use bootmem alloc ...*/
-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+static unsigned int *kstat_irqs_legacy;
int __init early_irq_init(void)
{
@@ -134,18 +145,32 @@ int __init early_irq_init(void)
int legacy_count;
int i;
+ init_irq_default_affinity();
+
+ /* initialize nr_irqs based on nr_cpu_ids */
+ arch_probe_nr_irqs();
+ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ /* allocate irq_desc_ptrs array based on nr_irqs */
+ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+
+ /* allocate based on nr_cpu_ids */
+ /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
+ kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int));
+
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
- desc[i].kstat_irqs = kstat_irqs_legacy[i];
+ desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+ init_alloc_desc_masks(&desc[i], 0, true);
irq_desc_ptrs[i] = desc + i;
}
- for (i = legacy_count; i < NR_IRQS; i++)
+ for (i = legacy_count; i < nr_irqs; i++)
irq_desc_ptrs[i] = NULL;
return arch_early_irq_init();
@@ -153,7 +178,10 @@ int __init early_irq_init(void)
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+ if (irq_desc_ptrs && irq < nr_irqs)
+ return irq_desc_ptrs[irq];
+
+ return NULL;
}
struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -162,10 +190,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
unsigned long flags;
int node;
- if (irq >= NR_IRQS) {
- printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
- irq, NR_IRQS);
- WARN_ON(1);
+ if (irq >= nr_irqs) {
+ WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+ irq, nr_irqs);
return NULL;
}
@@ -207,9 +234,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
}
};
@@ -220,12 +244,17 @@ int __init early_irq_init(void)
int count;
int i;
+ init_irq_default_affinity();
+
+ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
desc[i].irq = i;
desc[i].kstat_irqs = kstat_irqs_all[i];
+ init_alloc_desc_masks(&desc[i], 0, true);
}
return arch_early_irq_init();
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
extern struct lock_class_key irq_desc_lock_class;
extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
extern spinlock_t sparse_irq_lock;
+
+#ifdef CONFIG_SPARSE_IRQ
+/* irq_desc_ptrs allocated at boot time */
+extern struct irq_desc **irq_desc_ptrs;
+#else
+/* irq_desc_ptrs is a fixed size array */
extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+#endif
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..a3a5dc9ef346 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,17 +15,9 @@
#include "internals.h"
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
cpumask_var_t irq_default_affinity;
-static int init_irq_default_affinity(void)
-{
- alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
- cpumask_setall(irq_default_affinity);
- return 0;
-}
-core_initcall(init_irq_default_affinity);
-
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -98,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- cpumask_copy(&desc->affinity, cpumask);
+ cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
} else {
desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, cpumask);
+ cpumask_copy(desc->pending_mask, cpumask);
}
#else
- cpumask_copy(&desc->affinity, cpumask);
+ cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
desc->status |= IRQ_AFFINITY_SET;
@@ -127,16 +119,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+ if (cpumask_any_and(desc->affinity, cpu_online_mask)
< nr_cpu_ids)
goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+ cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
set_affinity:
- desc->chip->set_affinity(irq, &desc->affinity);
+ desc->chip->set_affinity(irq, desc->affinity);
return 0;
}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
desc->status &= ~IRQ_MOVE_PENDING;
- if (unlikely(cpumask_empty(&desc->pending_mask)))
+ if (unlikely(cpumask_empty(desc->pending_mask)))
return;
if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
< nr_cpu_ids)) {
- cpumask_and(&desc->affinity,
- &desc->pending_mask, cpu_online_mask);
- desc->chip->set_affinity(irq, &desc->affinity);
+ cpumask_and(desc->affinity,
+ desc->pending_mask, cpu_online_mask);
+ desc->chip->set_affinity(irq, desc->affinity);
}
- cpumask_clear(&desc->pending_mask);
+ cpumask_clear(desc->pending_mask);
}
void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..666260e4c065 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,15 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->kstat_irqs = NULL;
}
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
struct irq_desc *desc, int cpu)
{
memcpy(desc, old_desc, sizeof(struct irq_desc));
+ if (!init_alloc_desc_masks(desc, cpu, false)) {
+ printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+ "for migration.\n", irq);
+ return false;
+ }
spin_lock_init(&desc->lock);
desc->cpu = cpu;
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+ init_copy_desc_masks(old_desc, desc);
arch_init_copy_chip_data(old_desc, desc, cpu);
+ return true;
}
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -76,12 +83,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
node = cpu_to_node(cpu);
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
if (!desc) {
- printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+ printk(KERN_ERR "irq %d: can not get new irq_desc "
+ "for migration.\n", irq);
+ /* still use old one */
+ desc = old_desc;
+ goto out_unlock;
+ }
+ if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
/* still use old one */
+ kfree(desc);
desc = old_desc;
goto out_unlock;
}
- init_copy_one_irq_desc(irq, old_desc, desc, cpu);
irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- const struct cpumask *mask = &desc->affinity;
+ const struct cpumask *mask = desc->affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PENDING)
- mask = &desc->pending_mask;
+ mask = desc->pending_mask;
#endif
seq_cpumask(m, mask);
seq_putc(m, '\n');
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_all_shared_irqs(void)
{
struct irq_desc *desc;
int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
try_one_irq(i, desc);
}
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+ poll_all_shared_irqs();
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
+#ifdef CONFIG_DEBUG_SHIRQ
+void debug_poll_all_shared_irqs(void)
+{
+ poll_all_shared_irqs();
+}
+#endif
+
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
#define all_var 0
#endif
-extern const unsigned long kallsyms_addresses[];
-extern const u8 kallsyms_names[];
+/* These will be re-linked against their real values during the second link stage */
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
/* tell the compiler that the count isn't in the small data section if the arch
* has one (eg: FRV)
*/
extern const unsigned long kallsyms_num_syms
- __attribute__((__section__(".rodata")));
+__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[];
-extern const u16 kallsyms_token_index[];
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
-extern const unsigned long kallsyms_markers[];
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
/* do a binary search on the sorted kallsyms_addresses array */
low = 0;
high = kallsyms_num_syms;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index e4dcfb2272a4..b5ea4650f2ea 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -363,7 +363,7 @@ static void put_packet(char *buffer)
* Convert the memory pointed to by mem into hex, placing result in buf.
* Return a pointer to the last char put in buf (null). May return an error.
*/
-int kgdb_mem2hex(char *mem, char *buf, int count)
+int __weak kgdb_mem2hex(char *mem, char *buf, int count)
{
char *tmp;
int err;
@@ -393,7 +393,7 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
* 0x7d escaped with 0x7d. Return a pointer to the character after
* the last byte written.
*/
-static int kgdb_ebin2mem(char *buf, char *mem, int count)
+int __weak kgdb_ebin2mem(char *buf, char *mem, int count)
{
int err = 0;
char c;
@@ -418,7 +418,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
* Return a pointer to the character AFTER the last byte written.
* May return an error.
*/
-int kgdb_hex2mem(char *buf, char *mem, int count)
+int __weak kgdb_hex2mem(char *buf, char *mem, int count)
{
char *tmp_raw;
char *tmp_hex;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393d..fd765f9f1b2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
+ create->result = current;
complete(&create->started);
schedule();
@@ -101,9 +102,6 @@ static void create_kthread(struct kthread_create_info *create)
} else {
struct sched_param param = { .sched_priority = 0 };
wait_for_completion(&create->started);
- read_lock(&tasklist_lock);
- create->result = find_task_by_pid_ns(pid, &init_pid_ns);
- read_unlock(&tasklist_lock);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb532..50d022e5a560 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
/*
* Must be called with lock->wait_lock held.
*/
-void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
-{
- lock->owner = new_owner;
-}
-
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
- waiter->lock = lock;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ mutex_clear_owner(lock);
}
void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
- lock->owner = NULL;
lock->magic = lock;
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534b..6b2d735846a5 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
/*
* This must be called with lock->wait_lock held.
*/
-extern void
-debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
-
-static inline void debug_mutex_clear_owner(struct mutex *lock)
-{
- lock->owner = NULL;
-}
-
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658ef..5d79781394a3 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
* Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
* David Howells for suggestions and improvements.
*
+ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ * from the -rt tree, where it was originally implemented for rtmutexes
+ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ * and Sven Dietrich.
+ *
* Also see Documentation/mutex-design.txt.
*/
#include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
+ mutex_clear_owner(lock);
debug_mutex_init(lock, name, key);
}
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ mutex_set_owner(lock);
}
EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
* The unlocking fastpath is the 0->1 transition from 'locked'
* into 'unlocked' state:
*/
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(lock);
+#endif
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
}
@@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
{
struct task_struct *task = current;
struct mutex_waiter waiter;
- unsigned int old_val;
unsigned long flags;
+ preempt_disable();
+ mutex_acquire(&lock->dep_map, subclass, 0, ip);
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+ /*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that there are no
+ * pending waiters and the lock owner is currently running on a
+ * (different) CPU.
+ *
+ * The rationale is that if the lock owner is running, it is likely to
+ * release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ */
+
+ for (;;) {
+ struct thread_info *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ lock_acquired(&lock->dep_map, ip);
+ mutex_set_owner(lock);
+ preempt_enable();
+ return 0;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+#endif
spin_lock_mutex(&lock->wait_lock, flags);
debug_mutex_lock_common(lock, &waiter);
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- old_val = atomic_xchg(&lock->count, -1);
- if (old_val == 1)
+ if (atomic_xchg(&lock->count, -1) == 1)
goto done;
lock_contended(&lock->dep_map, ip);
@@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- old_val = atomic_xchg(&lock->count, -1);
- if (old_val == 1)
+ if (atomic_xchg(&lock->count, -1) == 1)
break;
/*
@@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
+ preempt_enable();
return -EINTR;
}
__set_task_state(task, state);
/* didnt get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
- schedule();
+ __schedule();
spin_lock_mutex(&lock->wait_lock, flags);
}
done:
lock_acquired(&lock->dep_map, ip);
/* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
- debug_mutex_set_owner(lock, task_thread_info(task));
+ mutex_remove_waiter(lock, &waiter, current_thread_info());
+ mutex_set_owner(lock);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -196,6 +265,7 @@ done:
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
+ preempt_enable();
return 0;
}
@@ -222,7 +292,8 @@ int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+ subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
wake_up_process(waiter->task);
}
- debug_mutex_clear_owner(lock);
-
spin_unlock_mutex(&lock->wait_lock, flags);
}
@@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
+ if (!ret)
+ mutex_set_owner(lock);
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
int __sched mutex_lock_killable(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_killable_slowpath);
+ if (!ret)
+ mutex_set_owner(lock);
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_killable);
@@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
prev = atomic_xchg(&lock->count, -1);
if (likely(prev == 1)) {
- debug_mutex_set_owner(lock, current_thread_info());
+ mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
}
+
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
*/
int __sched mutex_trylock(struct mutex *lock)
{
- return __mutex_fastpath_trylock(&lock->count,
- __mutex_trylock_slowpath);
+ int ret;
+
+ ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+ if (ret)
+ mutex_set_owner(lock);
+
+ return ret;
}
EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb290..67578ca48f94 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
-#define debug_mutex_clear_owner(lock) do { } while (0)
+#ifdef CONFIG_SMP
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+#else
+static inline void mutex_set_owner(struct mutex *lock)
+{
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+}
+#endif
+
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44d..33cab3de1763 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -74,6 +74,9 @@ NORET_TYPE void panic(const char * fmt, ...)
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ dump_stack();
+#endif
bust_spinlocks(0);
/*
@@ -355,15 +358,22 @@ EXPORT_SYMBOL(warn_slowpath);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
+
+#ifndef GCC_HAS_SP
+#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
+#endif
+
/*
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
void __stack_chk_fail(void)
{
- panic("stack-protector: Kernel stack is corrupted");
+ panic("stack-protector: Kernel stack is corrupted in: %p\n",
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(__stack_chk_fail);
+
#endif
core_param(panic, panic_timeout, int, 0644);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..fa07da94d7be 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
#include <linux/kernel_stat.h>
/*
- * Allocate the thread_group_cputime structure appropriately and fill in the
- * current values of the fields. Called from copy_signal() via
- * thread_group_cputime_clone_thread() when adding a second or subsequent
- * thread to a thread group. Assumes interrupts are enabled when called.
- */
-int thread_group_cputime_alloc(struct task_struct *tsk)
-{
- struct signal_struct *sig = tsk->signal;
- struct task_cputime *cputime;
-
- /*
- * If we have multiple threads and we don't already have a
- * per-CPU task_cputime struct (checked in the caller), allocate
- * one and fill it in with the times accumulated so far. We may
- * race with another thread so recheck after we pick up the sighand
- * lock.
- */
- cputime = alloc_percpu(struct task_cputime);
- if (cputime == NULL)
- return -ENOMEM;
- spin_lock_irq(&tsk->sighand->siglock);
- if (sig->cputime.totals) {
- spin_unlock_irq(&tsk->sighand->siglock);
- free_percpu(cputime);
- return 0;
- }
- sig->cputime.totals = cputime;
- cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
- cputime->utime = tsk->utime;
- cputime->stime = tsk->stime;
- cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
- spin_unlock_irq(&tsk->sighand->siglock);
- return 0;
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * @tsk: The task we use to identify the thread group.
- * @times: task_cputime structure in which we return the summed fields.
- *
- * Walk the list of CPUs to sum the per-CPU time fields in the thread group
- * time structure.
- */
-void thread_group_cputime(
- struct task_struct *tsk,
- struct task_cputime *times)
-{
- struct task_cputime *totals, *tot;
- int i;
-
- totals = tsk->signal->cputime.totals;
- if (!totals) {
- times->utime = tsk->utime;
- times->stime = tsk->stime;
- times->sum_exec_runtime = tsk->se.sum_exec_runtime;
- return;
- }
-
- times->stime = times->utime = cputime_zero;
- times->sum_exec_runtime = 0;
- for_each_possible_cpu(i) {
- tot = per_cpu_ptr(totals, i);
- times->utime = cputime_add(times->utime, tot->utime);
- times->stime = cputime_add(times->stime, tot->stime);
- times->sum_exec_runtime += tot->sum_exec_runtime;
- }
-}
-
-/*
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
*/
void update_rlimit_cpu(unsigned long rlim_new)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..432ee575c9ee 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
mutex_unlock(&pm_mutex);
}
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+ return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
@@ -411,6 +419,7 @@ int hibernation_platform_enter(void)
if (error)
goto Close;
+ entering_platform_hibernation = true;
suspend_console();
error = device_suspend(PMSG_HIBERNATE);
if (error) {
@@ -445,6 +454,7 @@ int hibernation_platform_enter(void)
Finish:
hibernation_ops->finish();
Resume_devices:
+ entering_platform_hibernation = false;
device_resume(PMSG_RESTORE);
resume_console();
Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 239988873971..b4d219016b6c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,16 +57,6 @@ int pm_notifier_call_chain(unsigned long val)
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
-static int suspend_test(int level)
-{
- if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
- return 1;
- }
- return 0;
-}
-
static const char * const pm_tests[__TEST_AFTER_LAST] = {
[TEST_NONE] = "none",
[TEST_CORE] = "core",
@@ -125,14 +115,24 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#else /* !CONFIG_PM_DEBUG */
-static inline int suspend_test(int level) { return 0; }
-#endif /* !CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_DEBUG */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_SUSPEND
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+#endif /* !CONFIG_PM_DEBUG */
+ return 0;
+}
+
#ifdef CONFIG_PM_TEST_SUSPEND
/*
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 490934fc7ac3..bd5a9003497c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -716,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user)
raise_rcu_softirq();
}
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
unsigned long flags;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c60..b2fd602a6f6f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1314,7 +1314,7 @@ int rcu_needs_cpu(int cpu)
* access due to the fact that this CPU cannot possibly have any RCU
* callbacks in flight yet.
*/
-static void
+static void __cpuinit
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..edc0ba6d8160 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
- if (unlikely(chan->has_base_filename))
+ if (unlikely(chan->has_base_filename)) {
+ mutex_unlock(&relay_channels_mutex);
return -EEXIST;
+ }
chan->has_base_filename = 1;
chan->parent = parent;
curr_cpu = get_cpu();
@@ -675,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
*/
for_each_online_cpu(i) {
if (unlikely(!chan->buf[i])) {
- printk(KERN_ERR "relay_late_setup_files: CPU %u "
- "has no buffer, it must have!\n", i);
- BUG();
+ WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..3a4763257395 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -467,11 +467,17 @@ struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- int highest_prio; /* highest queued rt task prio */
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
int overloaded;
+ struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
@@ -1610,21 +1616,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#endif
+#ifdef CONFIG_PREEMPT
+
/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
*/
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
@@ -1637,6 +1664,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
@@ -1705,6 +1748,9 @@ static void update_avg(u64 *avg, u64 sample)
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
+ if (wakeup)
+ p->se.start_runtime = p->se.sum_exec_runtime;
+
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
@@ -1712,10 +1758,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep && p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
+ if (sleep) {
+ if (p->se.last_wakeup) {
+ update_avg(&p->se.avg_overlap,
+ p->se.sum_exec_runtime - p->se.last_wakeup);
+ p->se.last_wakeup = 0;
+ } else {
+ update_avg(&p->se.avg_wakeup,
+ sysctl_sched_wakeup_granularity);
+ }
}
sched_info_dequeued(p);
@@ -2266,6 +2317,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (!sched_feat(SYNC_WAKEUPS))
sync = 0;
+ if (!sync) {
+ if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ sync = 1;
+ } else {
+ if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+ p->se.avg_overlap >= sysctl_sched_migration_cost)
+ sync = 0;
+ }
+
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE)) {
struct sched_domain *sd;
@@ -2345,6 +2406,22 @@ out_activate:
activate_task(rq, p, 1);
success = 1;
+ /*
+ * Only attribute actual wakeups done by this task.
+ */
+ if (!in_interrupt()) {
+ struct sched_entity *se = &current->se;
+ u64 sample = se->sum_exec_runtime;
+
+ if (se->last_wakeup)
+ sample -= se->last_wakeup;
+ else
+ sample -= se->start_runtime;
+ update_avg(&se->avg_wakeup, sample);
+
+ se->last_wakeup = se->sum_exec_runtime;
+ }
+
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
@@ -2355,8 +2432,6 @@ out_running:
p->sched_class->task_wake_up(rq, p);
#endif
out:
- current->se.last_wakeup = current->se.sum_exec_runtime;
-
task_rq_unlock(rq, &flags);
return success;
@@ -2386,6 +2461,8 @@ static void __sched_fork(struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
+ p->se.start_runtime = 0;
+ p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
@@ -2448,6 +2525,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
put_cpu();
}
@@ -2588,6 +2667,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+#ifdef CONFIG_SMP
+ int post_schedule = 0;
+
+ if (current->sched_class->needs_post_schedule)
+ post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
rq->prev_mm = NULL;
@@ -2606,7 +2691,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
- if (current->sched_class->post_schedule)
+ if (post_schedule)
current->sched_class->post_schedule(rq);
#endif
@@ -2987,6 +3072,16 @@ next:
pulled++;
rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible kernels
+ * will stop after the first task is pulled to minimize the critical
+ * section.
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ goto out;
+#endif
+
/*
* We only want to steal up to the prescribed amount of weighted load.
*/
@@ -3033,9 +3128,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
sd, idle, all_pinned, &this_best_prio);
class = class->next;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is pulled to minimize
+ * the critical section.
+ */
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
-
+#endif
} while (class && max_load_move > total_load_moved);
return total_load_moved > 0;
@@ -4538,15 +4639,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
/*
* schedule() is the main scheduler function.
*/
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
-need_resched:
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
@@ -4603,13 +4702,80 @@ need_resched_nonpreemptible:
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
+}
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+ preempt_disable();
+ __schedule();
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
EXPORT_SYMBOL(schedule);
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+ unsigned int cpu;
+ struct rq *rq;
+
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * Need to access the cpu field knowing that
+ * DEBUG_PAGEALLOC could have unmapped it if
+ * the mutex owner just released it and exited.
+ */
+ if (probe_kernel_address(&owner->cpu, cpu))
+ goto out;
+#else
+ cpu = owner->cpu;
+#endif
+
+ /*
+ * Even if the access succeeded (likely case),
+ * the cpu field may no longer be valid.
+ */
+ if (cpu >= nr_cpumask_bits)
+ goto out;
+
+ /*
+ * We need to validate that we can do a
+ * get_cpu() and that we have the percpu area.
+ */
+ if (!cpu_online(cpu))
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ /*
+ * Owner changed, break to re-assess state.
+ */
+ if (lock->owner != owner)
+ break;
+
+ /*
+ * Is that owner really running on that cpu?
+ */
+ if (task_thread_info(rq->curr) != owner || need_resched())
+ return 0;
+
+ cpu_relax();
+ }
+out:
+ return 1;
+}
+#endif
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -5939,12 +6105,7 @@ void sched_show_task(struct task_struct *p)
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- {
- unsigned long *n = end_of_stack(p);
- while (!*n)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(p);
- }
+ free = stack_not_used(p);
#endif
printk(KERN_CONT "%5lu %5d %6d\n", free,
task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -8204,11 +8365,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- rt_rq->highest_prio = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
+ plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
* current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ if (lowest_mask)
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
return 1;
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e4169..2b1260f0e800 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -397,6 +397,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.vruntime);
PN(se.sum_exec_runtime);
PN(se.avg_overlap);
+ PN(se.avg_wakeup);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044f..bc1563e7a248 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
cfs_rq->last = NULL;
@@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->next = NULL;
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ __clear_buddies(cfs_rq_of(se), se);
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -768,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ }
}
static void
@@ -1179,20 +1191,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
- struct task_struct *curr = this_rq->curr;
- struct task_group *tg;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ struct task_group *tg;
unsigned long weight;
int balanced;
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
- if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1302,16 +1309,63 @@ out:
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
+/*
+ * Adaptive granularity
+ *
+ * se->avg_wakeup gives the average time a task runs until it does a wakeup,
+ * with the limit of wakeup_gran -- when it never does a wakeup.
+ *
+ * So the smaller avg_wakeup is the faster we want this task to preempt,
+ * but we don't want to treat the preemptee unfairly and therefore allow it
+ * to run for at least the amount of time we'd like to run.
+ *
+ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
+ *
+ * NOTE: we use *nr_running to scale with load, this nicely matches the
+ * degrading latency on load.
+ */
+static unsigned long
+adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+ u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
+ u64 gran = 0;
+
+ if (this_run < expected_wakeup)
+ gran = expected_wakeup - this_run;
+
+ return min_t(s64, gran, sysctl_sched_wakeup_granularity);
+}
+
+static unsigned long
+wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
+ if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
+ gran = adaptive_gran(curr, se);
+
/*
- * More easily preempt - nice tasks, while not making it harder for
- * + nice tasks.
+ * Since its curr running now, convert the gran from real-time
+ * to virtual-time in his units.
*/
- if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+ if (sched_feat(ASYM_GRAN)) {
+ /*
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
+ } else {
+ if (unlikely(curr->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, curr);
+ }
return gran;
}
@@ -1338,7 +1392,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
- gran = wakeup_gran(curr);
+ gran = wakeup_gran(curr, se);
if (vdiff > gran)
return 1;
@@ -1419,9 +1473,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
if (!sched_feat(WAKEUP_PREEMPT))
return;
- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
- (se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost))) {
+ if (sched_feat(WAKEUP_OVERLAP) && sync) {
resched_task(curr);
return;
}
@@ -1452,6 +1504,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ /*
+ * If se was a buddy, clear it so that it will have to earn
+ * the favour again.
+ */
+ __clear_buddies(cfs_rq, se);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c6..4569bfa7df9b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 0)
+SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
* policies)
*/
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->rt;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
#ifdef CONFIG_SMP
static inline int rt_overloaded(struct rq *rq)
@@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rq *rq)
+static void update_rt_migration(struct rt_rq *rt_rq)
{
- if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- if (!rq->rt.overloaded) {
- rt_set_overload(rq);
- rq->rt.overloaded = 1;
+ if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ if (!rt_rq->overloaded) {
+ rt_set_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 1;
}
- } else if (rq->rt.overloaded) {
- rt_clear_overload(rq);
- rq->rt.overloaded = 0;
+ } else if (rt_rq->overloaded) {
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 0;
}
}
-#endif /* CONFIG_SMP */
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- return container_of(rt_se, struct task_struct, rt);
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory++;
+
+ update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory--;
+
+ update_rt_migration(rt_rq);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+#endif /* CONFIG_SMP */
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- return rt_se->rt_rq;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
- if (rt_rq->highest_prio < curr->prio)
+ if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
}
@@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
-
- return &rq->rt;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
- return rt_rq->highest_prio;
+ return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
@@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)
}
}
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+#if defined CONFIG_SMP
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
{
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
-#ifdef CONFIG_SMP
- struct rq *rq = rq_of_rt_rq(rt_rq);
-#endif
+ struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+ if (next && rt_prio(next->prio))
+ return next->prio;
+ else
+ return MAX_RT_PRIO;
+}
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (prio < prev_prio) {
+
+ /*
+ * If the new task is higher in priority than anything on the
+ * run-queue, we know that the previous high becomes our
+ * next-highest.
+ */
+ rt_rq->highest_prio.next = prev_prio;
- rt_rq->highest_prio = rt_se_prio(rt_se);
-#ifdef CONFIG_SMP
if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_se_prio(rt_se));
-#endif
- }
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
- rq->rt.rt_nr_migratory++;
- }
+ } else if (prio == rt_rq->highest_prio.curr)
+ /*
+ * If the next task is equal in priority to the highest on
+ * the run-queue, then we implicitly know that the next highest
+ * task cannot be any lower than current
+ */
+ rt_rq->highest_prio.next = prio;
+ else if (prio < rt_rq->highest_prio.next)
+ /*
+ * Otherwise, we need to recompute next-highest
+ */
+ rt_rq->highest_prio.next = next_prio(rq);
+}
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- if (rt_se_boosted(rt_se))
- rt_rq->rt_nr_boosted++;
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-#else
- start_rt_bandwidth(&def_rt_bandwidth);
-#endif
+ if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ rt_rq->highest_prio.next = next_prio(rq);
+
+ if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
+#else /* CONFIG_SMP */
+
static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_SMP
- int highest_prio = rt_rq->highest_prio;
-#endif
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (prio < prev_prio)
+ rt_rq->highest_prio.curr = prio;
+
+ inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
if (rt_rq->rt_nr_running) {
- struct rt_prio_array *array;
- WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- /* recalculate */
- array = &rt_rq->active;
- rt_rq->highest_prio =
+ WARN_ON(prio < prev_prio);
+
+ /*
+ * This may have been our highest task, and therefore
+ * we may have some recomputation to do
+ */
+ if (prio == prev_prio) {
+ struct rt_prio_array *array = &rt_rq->active;
+
+ rt_rq->highest_prio.curr =
sched_find_first_bit(array->bitmap);
- } /* otherwise leave rq->highest prio alone */
+ }
+
} else
- rt_rq->highest_prio = MAX_RT_PRIO;
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
- rq->rt.rt_nr_migratory--;
- }
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
- if (rt_rq->highest_prio != highest_prio) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
- if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_rq->highest_prio);
- }
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-#endif
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ int prio = rt_se_prio(rt_se);
+
+ WARN_ON(!rt_prio(prio));
+ rt_rq->rt_nr_running++;
+
+ inc_rt_prio(rt_rq, prio);
+ inc_rt_migration(rt_se, rt_rq);
+ inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running--;
+
+ dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ dec_rt_migration(rt_se, rt_rq);
+ dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
enqueue_rt_entity(rt_se);
+ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+
inc_cpu_load(rq, p->se.load.weight);
}
@@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ dequeue_pushable_task(rq, p);
+
dec_cpu_load(rq, p->se.load.weight);
}
@@ -805,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- cpumask_var_t mask;
-
if (rq->curr->rt.nr_cpus_allowed == 1)
return;
- if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
- return;
-
if (p->rt.nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, mask))
- goto free;
+ && cpupri_find(&rq->rd->cpupri, p, NULL))
+ return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
- goto free;
+ if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ return;
/*
* There appears to be other cpus that can accept
@@ -827,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
requeue_task_rt(rq, p, 1);
resched_task(rq->curr);
-free:
- free_cpumask_var(mask);
}
#endif /* CONFIG_SMP */
@@ -878,7 +1014,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
return next;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
@@ -900,6 +1036,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = _pick_next_task_rt(rq);
+
+ /* The running task is never eligible for pushing */
+ if (p)
+ dequeue_pushable_task(rq, p);
+
return p;
}
@@ -907,6 +1055,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
p->se.exec_start = 0;
+
+ /*
+ * The previous task needs to be made eligible for pushing
+ * if it is still active
+ */
+ if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
@@ -960,16 +1115,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+ const struct cpumask *mask)
{
int first;
/* "this_cpu" is cheaper to preempt than a remote processor */
- if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+ if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
return this_cpu;
- first = first_cpu(*mask);
- if (first != NR_CPUS)
+ first = cpumask_first(mask);
+ if (first < nr_cpu_ids)
return first;
return -1;
@@ -981,6 +1137,7 @@ static int find_lowest_rq(struct task_struct *task)
struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ cpumask_var_t domain_mask;
if (task->rt.nr_cpus_allowed == 1)
return -1; /* No other targets possible */
@@ -1013,19 +1170,25 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu == cpu)
this_cpu = -1; /* Skip this_cpu opt if the same */
- for_each_domain(cpu, sd) {
- if (sd->flags & SD_WAKE_AFFINE) {
- cpumask_t domain_mask;
- int best_cpu;
+ if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
- cpumask_and(&domain_mask, sched_domain_span(sd),
- lowest_mask);
+ cpumask_and(domain_mask,
+ sched_domain_span(sd),
+ lowest_mask);
- best_cpu = pick_optimal_cpu(this_cpu,
- &domain_mask);
- if (best_cpu != -1)
- return best_cpu;
+ best_cpu = pick_optimal_cpu(this_cpu,
+ domain_mask);
+
+ if (best_cpu != -1) {
+ free_cpumask_var(domain_mask);
+ return best_cpu;
+ }
+ }
}
+ free_cpumask_var(domain_mask);
}
/*
@@ -1072,7 +1235,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
}
/* If this rq is still suitable use it. */
- if (lowest_rq->rt.highest_prio > task->prio)
+ if (lowest_rq->rt.highest_prio.curr > task->prio)
break;
/* try again */
@@ -1083,6 +1246,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+ BUG_ON(!p->se.on_rq);
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
@@ -1092,13 +1280,11 @@ static int push_rt_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *lowest_rq;
- int ret = 0;
- int paranoid = RT_MAX_TRIES;
if (!rq->rt.overloaded)
return 0;
- next_task = pick_next_highest_task_rt(rq, -1);
+ next_task = pick_next_pushable_task(rq);
if (!next_task)
return 0;
@@ -1127,16 +1313,34 @@ static int push_rt_task(struct rq *rq)
struct task_struct *task;
/*
* find lock_lowest_rq releases rq->lock
- * so it is possible that next_task has changed.
- * If it has, then try again.
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
*/
- task = pick_next_highest_task_rt(rq, -1);
- if (unlikely(task != next_task) && task && paranoid--) {
- put_task_struct(next_task);
- next_task = task;
- goto retry;
+ task = pick_next_pushable_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * If we get here, the task hasnt moved at all, but
+ * it has failed to push. We will not try again,
+ * since the other cpus will pull from us when they
+ * are ready.
+ */
+ dequeue_pushable_task(rq, next_task);
+ goto out;
}
- goto out;
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
}
deactivate_task(rq, next_task, 0);
@@ -1147,23 +1351,12 @@ static int push_rt_task(struct rq *rq)
double_unlock_balance(rq, lowest_rq);
- ret = 1;
out:
put_task_struct(next_task);
- return ret;
+ return 1;
}
-/*
- * TODO: Currently we just use the second highest prio task on
- * the queue, and stop when it can't migrate (or there's
- * no more RT tasks). There may be a case where a lower
- * priority RT task has a different affinity than the
- * higher RT task. In this case the lower RT task could
- * possibly be able to migrate where as the higher priority
- * RT task could not. We currently ignore this issue.
- * Enhancements are welcome!
- */
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
@@ -1174,33 +1367,35 @@ static void push_rt_tasks(struct rq *rq)
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p, *next;
+ struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
return 0;
- next = pick_next_task_rt(this_rq);
-
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if its going logically lower, we do not care
+ */
+ if (src_rq->rt.highest_prio.next >=
+ this_rq->rt.highest_prio.curr)
+ continue;
+
/*
* We can potentially drop this_rq's lock in
* double_lock_balance, and another CPU could
- * steal our next task - hence we must cause
- * the caller to recalculate the next task
- * in that case:
+ * alter this_rq
*/
- if (double_lock_balance(this_rq, src_rq)) {
- struct task_struct *old_next = next;
-
- next = pick_next_task_rt(this_rq);
- if (next != old_next)
- ret = 1;
- }
+ double_lock_balance(this_rq, src_rq);
/*
* Are there still pullable RT tasks?
@@ -1214,7 +1409,7 @@ static int pull_rt_task(struct rq *this_rq)
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- if (p && (!next || (p->prio < next->prio))) {
+ if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq);
@@ -1224,12 +1419,9 @@ static int pull_rt_task(struct rq *this_rq)
* This is just that p is wakeing up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
- * current task on the run queue or
- * this_rq next task is lower in prio than
- * the current task on that rq.
+ * current task on the run queue
*/
- if (p->prio < src_rq->curr->prio ||
- (next && next->prio < src_rq->curr->prio))
+ if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
@@ -1242,13 +1434,7 @@ static int pull_rt_task(struct rq *this_rq)
* case there's an even higher prio task
* in another runqueue. (low likelyhood
* but possible)
- *
- * Update next so that we won't pick a task
- * on another cpu with a priority lower (or equal)
- * than the one we just picked.
*/
- next = p;
-
}
skip:
double_unlock_balance(this_rq, src_rq);
@@ -1260,24 +1446,27 @@ static int pull_rt_task(struct rq *this_rq)
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
+/*
+ * assumes rq->lock is held
+ */
+static int needs_post_schedule_rt(struct rq *rq)
+{
+ return has_pushable_tasks(rq);
+}
+
static void post_schedule_rt(struct rq *rq)
{
/*
- * If we have more than one rt_task queued, then
- * see if we can push the other rt_tasks off to other CPUS.
- * Note we may release the rq lock, and since
- * the lock was owned by prev, we need to release it
- * first via finish_lock_switch and then reaquire it here.
+ * This is only called if needs_post_schedule_rt() indicates that
+ * we need to push tasks away
*/
- if (unlikely(rq->rt.overloaded)) {
- spin_lock_irq(&rq->lock);
- push_rt_tasks(rq);
- spin_unlock_irq(&rq->lock);
- }
+ spin_lock_irq(&rq->lock);
+ push_rt_tasks(rq);
+ spin_unlock_irq(&rq->lock);
}
/*
@@ -1288,7 +1477,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- rq->rt.overloaded)
+ has_pushable_tasks(rq) &&
+ p->rt.nr_cpus_allowed > 1)
push_rt_tasks(rq);
}
@@ -1324,6 +1514,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
+ if (!task_current(rq, p)) {
+ /*
+ * Make sure we dequeue this task from the pushable list
+ * before going further. It will either remain off of
+ * the list because we are no longer pushable, or it
+ * will be requeued.
+ */
+ if (p->rt.nr_cpus_allowed > 1)
+ dequeue_pushable_task(rq, p);
+
+ /*
+ * Requeue if our weight is changing and still > 1
+ */
+ if (weight > 1)
+ enqueue_pushable_task(rq, p);
+
+ }
+
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1331,7 +1539,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
rq->rt.rt_nr_migratory--;
}
- update_rt_migration(rq);
+ update_rt_migration(&rq->rt);
}
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -1346,7 +1554,7 @@ static void rq_online_rt(struct rq *rq)
__enable_runtime(rq);
- cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
/* Assumes rq->lock is held */
@@ -1438,7 +1646,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
* can release the rq lock and p could migrate.
* Only reschedule if p is still on the same runqueue.
*/
- if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
@@ -1509,6 +1717,9 @@ static void set_curr_task_rt(struct rq *rq)
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
@@ -1531,6 +1742,7 @@ static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
+ .needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d1226..8ab0cef8ecab 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
static inline void account_group_user_time(struct task_struct *tsk,
cputime_t cputime)
{
+ struct task_cputime *times;
struct signal_struct *sig;
/* tsk == current, ensure it is safe to use ->signal */
@@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
return;
sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->utime = cputime_add(times->utime, cputime);
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->utime = cputime_add(times->utime, cputime);
+ spin_unlock(&times->lock);
}
/**
@@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
static inline void account_group_system_time(struct task_struct *tsk,
cputime_t cputime)
{
+ struct task_cputime *times;
struct signal_struct *sig;
/* tsk == current, ensure it is safe to use ->signal */
@@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
return;
sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->stime = cputime_add(times->stime, cputime);
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->stime = cputime_add(times->stime, cputime);
+ spin_unlock(&times->lock);
}
/**
@@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
+ struct task_cputime *times;
struct signal_struct *sig;
sig = tsk->signal;
@@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (unlikely(!sig))
return;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->sum_exec_runtime += ns;
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->sum_exec_runtime += ns;
+ spin_unlock(&times->lock);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index e73759783dc8..b6b36768b758 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
}
#endif
printk("\n");
+ preempt_disable();
show_regs(regs);
+ preempt_enable();
}
static int __init setup_print_fatal_signals(char *str)
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..cf2bc01186ef
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
+ * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
+ * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+ void __user *, size_t *, loff_t *);
+#endif
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+ * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "min-threads",
+ .data = &slow_work_min_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_min_threads_sysctl,
+ .extra1 = (void *) &slow_work_min_min_threads,
+ .extra2 = &slow_work_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max-threads",
+ .data = &slow_work_max_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_max_threads_sysctl,
+ .extra1 = &slow_work_min_threads,
+ .extra2 = (void *) &slow_work_max_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "vslow-percentage",
+ .data = &vslow_work_proportion,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = (void *) &slow_work_min_vslow,
+ .extra2 = (void *) &slow_work_max_vslow,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
+/*
+ * The queues of work items and the lock governing access to them. These are
+ * shared between all the CPUs. It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls. A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock. Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool. This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+ unsigned vsmax;
+
+ vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+ vsmax /= 100;
+ vsmax = max(vsmax, 1U);
+ return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread. Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+ struct slow_work *work = NULL;
+ unsigned vsmax;
+ bool very_slow;
+
+ vsmax = slow_work_calc_vsmax();
+
+ /* see if we can schedule a new thread to be started if we're not
+ * keeping up with the work */
+ if (!waitqueue_active(&slow_work_thread_wq) &&
+ (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+ atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+ !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+
+ /* find something to execute */
+ spin_lock_irq(&slow_work_queue_lock);
+ if (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax) {
+ work = list_entry(vslow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ atomic_inc(&vslow_work_executing_count);
+ very_slow = true;
+ } else if (!list_empty(&slow_work_queue)) {
+ work = list_entry(slow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ very_slow = false;
+ } else {
+ very_slow = false; /* avoid the compiler warning */
+ }
+ spin_unlock_irq(&slow_work_queue_lock);
+
+ if (!work)
+ return false;
+
+ if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+ BUG();
+
+ work->ops->execute(work);
+
+ if (very_slow)
+ atomic_dec(&vslow_work_executing_count);
+ clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+ /* if someone tried to enqueue the item whilst we were executing it,
+ * then it'll be left unenqueued to avoid multiple threads trying to
+ * execute it simultaneously
+ *
+ * there is, however, a race between us testing the pending flag and
+ * getting the spinlock, and between the enqueuer setting the pending
+ * flag and getting the spinlock, so we use a deferral bit to tell us
+ * if the enqueuer got there first
+ */
+ if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irq(&slow_work_queue_lock);
+
+ if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+ test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+ goto auto_requeue;
+
+ spin_unlock_irq(&slow_work_queue_lock);
+ }
+
+ work->ops->put_ref(work);
+ return true;
+
+auto_requeue:
+ /* we must complete the enqueue operation
+ * - we transfer our ref on the item back to the appropriate queue
+ * - don't wake another thread up as we're awake already
+ */
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ spin_unlock_irq(&slow_work_queue_lock);
+ return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing. If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations. The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention. The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute. This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+ unsigned long flags;
+
+ BUG_ON(slow_work_user_count <= 0);
+ BUG_ON(!work);
+ BUG_ON(!work->ops);
+ BUG_ON(!work->ops->get_ref);
+
+ /* when honouring an enqueue request, we only promise that we will run
+ * the work function in the future; we do not promise to run it once
+ * per enqueue request
+ *
+ * we use the PENDING bit to merge together repeat requests without
+ * having to disable IRQs and take the spinlock, whilst still
+ * maintaining our promise
+ */
+ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ /* we promise that we will not attempt to execute the work
+ * function in more than one thread simultaneously
+ *
+ * this, however, leaves us with a problem if we're asked to
+ * enqueue the work whilst someone is executing the work
+ * function as simply queueing the work immediately means that
+ * another thread may try executing it whilst it is already
+ * under execution
+ *
+ * to deal with this, we set the ENQ_DEFERRED bit instead of
+ * enqueueing, and the thread currently executing the work
+ * function will enqueue the work item when the work function
+ * returns and it has cleared the EXECUTING bit
+ */
+ if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+ set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+ } else {
+ if (work->ops->get_ref(work) < 0)
+ goto cant_get_ref;
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ wake_up(&slow_work_thread_wq);
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ }
+ return 0;
+
+cant_get_ref:
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+ unsigned long flags;
+ bool do_cull = false;
+
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ if (slow_work_cull) {
+ slow_work_cull = false;
+
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads) {
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ do_cull = true;
+ }
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return do_cull;
+}
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+ return !list_empty(&slow_work_queue) ||
+ (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+ int vsmax;
+
+ DEFINE_WAIT(wait);
+
+ set_freezable();
+ set_user_nice(current, -5);
+
+ for (;;) {
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ prepare_to_wait(&slow_work_thread_wq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !slow_work_threads_should_exit &&
+ !slow_work_available(vsmax) &&
+ !slow_work_cull)
+ schedule();
+ finish_wait(&slow_work_thread_wq, &wait);
+
+ try_to_freeze();
+
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ if (slow_work_available(vsmax) && slow_work_execute()) {
+ cond_resched();
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ continue;
+ }
+
+ if (slow_work_threads_should_exit)
+ break;
+
+ if (slow_work_cull && slow_work_cull_thread())
+ break;
+ }
+
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete_and_exit(&slow_work_last_thread_exited, 0);
+ return 0;
+}
+
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+ slow_work_cull = true;
+ wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+ return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+ struct task_struct *p;
+
+ if (slow_work_threads_should_exit)
+ return;
+
+ if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+ return;
+
+ if (!mutex_trylock(&slow_work_user_lock))
+ return;
+
+ slow_work_may_not_start_new_thread = true;
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p)) {
+ printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ BUG(); /* we're running on a slow work thread... */
+ mod_timer(&slow_work_oom_timer,
+ jiffies + SLOW_WORK_OOM_TIMEOUT);
+ } else {
+ /* ratelimit the starting of new threads */
+ mod_timer(&slow_work_oom_timer, jiffies + 1);
+ }
+
+ mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+ .get_ref = slow_work_new_thread_get_ref,
+ .put_ref = slow_work_new_thread_put_ref,
+ .execute = slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+ slow_work_may_not_start_new_thread = false;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to start or stop threads */
+ n = atomic_read(&slow_work_thread_count) -
+ slow_work_min_threads;
+
+ if (n < 0 && !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+ else if (n > 0)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to stop threads */
+ n = slow_work_max_threads -
+ atomic_read(&slow_work_thread_count);
+
+ if (n < 0)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point. This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+ struct task_struct *p;
+ int loop;
+
+ mutex_lock(&slow_work_user_lock);
+
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+ init_completion(&slow_work_last_thread_exited);
+
+ slow_work_threads_should_exit = false;
+ slow_work_init(&slow_work_new_thread,
+ &slow_work_new_thread_ops);
+ slow_work_may_not_start_new_thread = false;
+ slow_work_cull = false;
+
+ /* start the minimum number of threads */
+ for (loop = 0; loop < slow_work_min_threads; loop++) {
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p))
+ goto error;
+ }
+ printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+ }
+
+ slow_work_user_count++;
+ mutex_unlock(&slow_work_user_lock);
+ return 0;
+
+error:
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete(&slow_work_last_thread_exited);
+ if (loop > 0) {
+ printk(KERN_ERR "Slow work thread pool:"
+ " Aborting startup on ENOMEM\n");
+ slow_work_threads_should_exit = true;
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_ERR "Slow work thread pool: Aborted\n");
+ }
+ mutex_unlock(&slow_work_user_lock);
+ return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+ mutex_lock(&slow_work_user_lock);
+
+ BUG_ON(slow_work_user_count <= 0);
+
+ slow_work_user_count--;
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+ slow_work_threads_should_exit = true;
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_NOTICE "Slow work thread pool:"
+ " Shut down complete\n");
+ }
+
+ del_timer_sync(&slow_work_cull_timer);
+
+ mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+ unsigned nr_cpus = num_possible_cpus();
+
+ if (slow_work_max_threads < nr_cpus)
+ slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+ if (slow_work_max_max_threads < nr_cpus * 2)
+ slow_work_max_max_threads = nr_cpus * 2;
+#endif
+ return 0;
+}
+
+subsys_initcall(init_slow_work);
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e88..bbedbb7efe32 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
enum {
CSD_FLAG_WAIT = 0x01,
CSD_FLAG_ALLOC = 0x02,
+ CSD_FLAG_LOCK = 0x04,
};
struct call_function_data {
@@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void)
if (data_flags & CSD_FLAG_WAIT) {
smp_wmb();
data->flags &= ~CSD_FLAG_WAIT;
+ } else if (data_flags & CSD_FLAG_LOCK) {
+ smp_wmb();
+ data->flags &= ~CSD_FLAG_LOCK;
} else if (data_flags & CSD_FLAG_ALLOC)
kfree(data);
}
@@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void)
}
}
+static DEFINE_PER_CPU(struct call_single_data, csd_data);
+
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
@@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
func(info);
local_irq_restore(flags);
} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = NULL;
+ struct call_single_data *data;
if (!wait) {
+ /*
+ * We are calling a function on a single CPU
+ * and we are not going to wait for it to finish.
+ * We first try to allocate the data, but if we
+ * fail, we fall back to use a per cpu data to pass
+ * the information to that CPU. Since all callers
+ * of this code will use the same data, we must
+ * synchronize the callers to prevent a new caller
+ * from corrupting the data before the callee
+ * can access it.
+ *
+ * The CSD_FLAG_LOCK is used to let us know when
+ * the IPI handler is done with the data.
+ * The first caller will set it, and the callee
+ * will clear it. The next caller must wait for
+ * it to clear before we set it again. This
+ * will make sure the callee is done with the
+ * data before a new caller will use it.
+ */
data = kmalloc(sizeof(*data), GFP_ATOMIC);
if (data)
data->flags = CSD_FLAG_ALLOC;
- }
- if (!data) {
+ else {
+ data = &per_cpu(csd_data, me);
+ while (data->flags & CSD_FLAG_LOCK)
+ cpu_relax();
+ data->flags = CSD_FLAG_LOCK;
+ }
+ } else {
data = &d;
data->flags = CSD_FLAG_WAIT;
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..7e93870cf3ea 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -361,6 +361,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
EXPORT_SYMBOL(__tasklet_hi_schedule);
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ BUG_ON(!irqs_disabled());
+
+ t->next = __get_cpu_var(tasklet_hi_vec).head;
+ __get_cpu_var(tasklet_hi_vec).head = t;
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
+
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -795,6 +806,11 @@ int __init __weak early_irq_init(void)
return 0;
}
+int __init __weak arch_probe_nr_irqs(void)
+{
+ return 0;
+}
+
int __init __weak arch_early_irq_init(void)
{
return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
+#include <linux/sysctl.h>
#include <asm/irq_regs.h>
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
}
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ touch_all_softlockup_watchdogs();
+ return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+}
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
@@ -157,97 +166,11 @@ void softlockup_tick(void)
}
/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -267,11 +190,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;
- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -303,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = cpumask_any(cpu_online_mask);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- /* Pick any other online cpu. */
- check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 368d1638ee78..d99d2e86cbe2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
+#include <linux/kmemcheck.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -48,6 +49,7 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
+#include <linux/slow-work.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -809,11 +811,24 @@ static struct ctl_table kern_table[] = {
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &proc_dosoftlockup_thresh,
.strategy = &sysctl_intvec,
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -829,7 +844,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
@@ -889,6 +904,16 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_KMEMCHECK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_UNEVICTABLE_LRU
{
.ctl_name = CTL_UNNUMBERED,
@@ -899,6 +924,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &scan_unevictable_handler,
},
#endif
+#ifdef CONFIG_SLOW_WORK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "slow-work",
+ .mode = 0555,
+ .child = slow_work_sysctls,
+ },
+#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c0..b38423ca711a 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
{ NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
{ NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
{ NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
+ { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
{}
};
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83cf..d13be216a790 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
if (dev->mode != mode) {
dev->set_mode(mode, dev);
dev->mode = mode;
+
+ /*
+ * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+ * on it, so fix it up and emit a warning:
+ */
+ if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (unlikely(!dev->mult)) {
+ dev->mult = 1;
+ WARN_ON(1);
+ }
+ }
}
}
@@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(!dev->cpumask);
- /*
- * A nsec2cyc multiplicator of 0 is invalid and we'd crash
- * on it, so fix it up and emit a warning:
- */
- if (unlikely(!dev->mult)) {
- dev->mult = 1;
- WARN_ON(1);
- }
-
spin_lock(&clockevents_lock);
list_add(&dev->list, &clockevent_devices);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a09..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -274,6 +274,21 @@ out_bc:
}
/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = cpumask_first(cpu_online_mask);
+
+ tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
+ TICK_DO_TIMER_NONE;
+ }
+}
+
+/*
* Shutdown an event device on a given cpu:
*
* This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
- /* Transfer the do_timer job away from this cpu */
- if (*cpup == tick_do_timer_cpu) {
- int cpu = cpumask_first(cpu_online_mask);
-
- tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
- TICK_DO_TIMER_NONE;
- }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
tick_broadcast_oneshot_control(reason);
break;
+ case CLOCK_EVT_NOTIFY_CPU_DYING:
+ tick_handover_do_timer(dev);
+ break;
+
case CLOCK_EVT_NOTIFY_CPU_DEAD:
tick_shutdown_broadcast_oneshot(dev);
tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0a..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
* value. We do this unconditionally on any cpu, as we don't know whether the
* cpu, which has the update task assigned is in a long sleep.
*/
-void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a6..dde1d46f77e5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -264,6 +264,38 @@ config HW_BRANCH_TRACER
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
+config KMEMTRACE
+ bool "Trace SLAB allocations"
+ select TRACING
+ help
+ kmemtrace provides tracing for slab allocator functions, such as
+ kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
+ data is then fed to the userspace application in order to analyse
+ allocation hotspots, internal fragmentation and so on, making it
+ possible to see how well an allocator performs, as well as debug
+ and profile kernel code.
+
+ This requires an userspace application to use. See
+ Documentation/vm/kmemtrace.txt for more information.
+
+ Saying Y will make the kernel somewhat larger and slower. However,
+ if you disable kmemtrace at run-time or boot-time, the performance
+ impact is minimal (depending on the arch the kernel is built for).
+
+ If unsure, say N.
+
+config WORKQUEUE_TRACER
+ bool "Trace workqueues"
+ select TRACING
+ help
+ The workqueue tracer provides some statistical informations
+ about each cpu workqueue thread such as the number of the
+ works inserted and executed since their creation. It can help
+ to evaluate the amount of work each of them have to perform.
+ For example it can help a developer to decide whether he should
+ choose a per cpu workqueue instead of a singlethreaded one.
+
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
@@ -302,4 +334,27 @@ config FTRACE_STARTUP_TEST
functioning properly. It will do tests on all the configured
tracers of ftrace.
+config MMIOTRACE
+ bool "Memory mapped IO tracing"
+ depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+ select TRACING
+ help
+ Mmiotrace traces Memory Mapped I/O access and is meant for
+ debugging and reverse engineering. It is called from the ioremap
+ implementation and works via page faults. Tracing is disabled by
+ default and can be enabled at run-time.
+
+ See Documentation/tracers/mmiotrace.txt.
+ If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+ tristate "Test module for mmiotrace"
+ depends on MMIOTRACE && m
+ help
+ This is a dumb module for testing mmiotrace. It is very dangerous
+ as it will write garbage to IO memory starting at a given address.
+ However, it should be safe to use on e.g. unused portion of VRAM.
+
+ Say N, unless you absolutely know what you are doing.
+
endmenu
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..f76d48f3527d 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,8 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +35,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
+obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
+obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..7e9a20b69939 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
#include <linux/clocksource.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
@@ -263,14 +264,6 @@ static void ftrace_update_pid_func(void)
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
-/*
- * Since MCOUNT_ADDR may point to mcount itself, we do not want
- * to get it confused by reading a reference in the code as we
- * are parsing on objcopy output of text. Use a variable for
- * it instead.
- */
-static unsigned long mcount_addr = MCOUNT_ADDR;
-
enum {
FTRACE_ENABLE_CALLS = (1 << 0),
FTRACE_DISABLE_CALLS = (1 << 1),
@@ -289,7 +282,7 @@ static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
struct ftrace_page *next;
- unsigned long index;
+ int index;
struct dyn_ftrace records[];
};
@@ -463,7 +456,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
unsigned long ip, fl;
unsigned long ftrace_addr;
- ftrace_addr = (unsigned long)ftrace_caller;
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
ip = rec->ip;
@@ -575,7 +568,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ip = rec->ip;
- ret = ftrace_make_nop(mod, rec, mcount_addr);
+ ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
ftrace_bug(ret, ip);
rec->flags |= FTRACE_FL_FAILED;
@@ -786,7 +779,7 @@ enum {
struct ftrace_iterator {
struct ftrace_page *pg;
- unsigned idx;
+ int idx;
unsigned flags;
unsigned char buffer[FTRACE_BUFF_MAX+1];
unsigned buffer_idx;
@@ -1902,7 +1895,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
}
/**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
* @ops - ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
@@ -1965,6 +1958,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static atomic_t ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -2043,6 +2037,27 @@ static int start_graph_tracing(void)
return ret;
}
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+ void *unused)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ pause_graph_tracing();
+ break;
+
+ case PM_POST_HIBERNATION:
+ unpause_graph_tracing();
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -2050,6 +2065,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
mutex_lock(&ftrace_sysctl_lock);
+ ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+ register_pm_notifier(&ftrace_suspend_notifier);
+
atomic_inc(&ftrace_graph_active);
ret = start_graph_tracing();
if (ret) {
@@ -2075,6 +2093,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
mutex_unlock(&ftrace_sysctl_lock);
}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 000000000000..7ebc58cee3bd
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,350 @@
+/*
+ * Memory allocator tracing
+ *
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/dcache.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <trace/kmemtrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_KMEM_OPT_MINIMAL 0x1
+
+static struct tracer_opt kmem_opts[] = {
+ /* Default disable the minimalistic output */
+ { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
+ { }
+};
+
+static struct tracer_flags kmem_tracer_flags = {
+ .val = 0,
+ .opts = kmem_opts
+};
+
+
+static bool kmem_tracing_enabled __read_mostly;
+static struct trace_array *kmemtrace_array;
+
+static int kmem_trace_init(struct trace_array *tr)
+{
+ int cpu;
+ kmemtrace_array = tr;
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ tracing_reset(tr, cpu);
+
+ kmem_tracing_enabled = true;
+
+ return 0;
+}
+
+static void kmem_trace_reset(struct trace_array *tr)
+{
+ kmem_tracing_enabled = false;
+}
+
+static void kmemtrace_headers(struct seq_file *s)
+{
+ /* Don't need headers for the original kmemtrace output */
+ if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+ return;
+
+ seq_printf(s, "#\n");
+ seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
+ " POINTER NODE CALLER\n");
+ seq_printf(s, "# FREE | | | | "
+ " | | | |\n");
+ seq_printf(s, "# |\n\n");
+}
+
+/*
+ * The two following functions give the original output from kmemtrace,
+ * or something close to....perhaps they need some missing things
+ */
+static enum print_line_t
+kmemtrace_print_alloc_original(struct trace_iterator *iter,
+ struct kmemtrace_alloc_entry *entry)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ /* Taken from the old linux/kmemtrace.h */
+ ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
+ "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+ entry->type_id, entry->call_site, (unsigned long) entry->ptr,
+ (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
+ (unsigned long) entry->gfp_flags, entry->node);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_original(struct trace_iterator *iter,
+ struct kmemtrace_free_entry *entry)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ /* Taken from the old linux/kmemtrace.h */
+ ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
+ entry->type_id, entry->call_site, (unsigned long) entry->ptr);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+
+/* The two other following provide a more minimalistic output */
+static enum print_line_t
+kmemtrace_print_alloc_compress(struct trace_iterator *iter,
+ struct kmemtrace_alloc_entry *entry)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ /* Alloc entry */
+ ret = trace_seq_printf(s, " + ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Type */
+ switch (entry->type_id) {
+ case KMEMTRACE_TYPE_KMALLOC:
+ ret = trace_seq_printf(s, "K ");
+ break;
+ case KMEMTRACE_TYPE_CACHE:
+ ret = trace_seq_printf(s, "C ");
+ break;
+ case KMEMTRACE_TYPE_PAGES:
+ ret = trace_seq_printf(s, "P ");
+ break;
+ default:
+ ret = trace_seq_printf(s, "? ");
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Requested */
+ ret = trace_seq_printf(s, "%4ld ", entry->bytes_req);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Allocated */
+ ret = trace_seq_printf(s, "%4ld ", entry->bytes_alloc);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Flags
+ * TODO: would be better to see the name of the GFP flag names
+ */
+ ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Pointer to allocated */
+ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Node */
+ ret = trace_seq_printf(s, "%4d ", entry->node);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Call site */
+ ret = seq_print_ip_sym(s, entry->call_site, 0);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!trace_seq_printf(s, "\n"))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_compress(struct trace_iterator *iter,
+ struct kmemtrace_free_entry *entry)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ /* Free entry */
+ ret = trace_seq_printf(s, " - ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Type */
+ switch (entry->type_id) {
+ case KMEMTRACE_TYPE_KMALLOC:
+ ret = trace_seq_printf(s, "K ");
+ break;
+ case KMEMTRACE_TYPE_CACHE:
+ ret = trace_seq_printf(s, "C ");
+ break;
+ case KMEMTRACE_TYPE_PAGES:
+ ret = trace_seq_printf(s, "P ");
+ break;
+ default:
+ ret = trace_seq_printf(s, "? ");
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Skip requested/allocated/flags */
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Pointer to allocated */
+ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Skip node */
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Call site */
+ ret = seq_print_ip_sym(s, entry->call_site, 0);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!trace_seq_printf(s, "\n"))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+
+ switch (entry->type) {
+ case TRACE_KMEM_ALLOC: {
+ struct kmemtrace_alloc_entry *field;
+ trace_assign_type(field, entry);
+ if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+ return kmemtrace_print_alloc_compress(iter, field);
+ else
+ return kmemtrace_print_alloc_original(iter, field);
+ }
+
+ case TRACE_KMEM_FREE: {
+ struct kmemtrace_free_entry *field;
+ trace_assign_type(field, entry);
+ if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+ return kmemtrace_print_free_compress(iter, field);
+ else
+ return kmemtrace_print_free_original(iter, field);
+ }
+
+ default:
+ return TRACE_TYPE_UNHANDLED;
+ }
+}
+
+/* Trace allocations */
+void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ struct ring_buffer_event *event;
+ struct kmemtrace_alloc_entry *entry;
+ struct trace_array *tr = kmemtrace_array;
+ unsigned long irq_flags;
+
+ if (!kmem_tracing_enabled)
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_ALLOC;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+ entry->bytes_req = bytes_req;
+ entry->bytes_alloc = bytes_alloc;
+ entry->gfp_flags = gfp_flags;
+ entry->node = node;
+
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+}
+EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
+
+void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr)
+{
+ struct ring_buffer_event *event;
+ struct kmemtrace_free_entry *entry;
+ struct trace_array *tr = kmemtrace_array;
+ unsigned long irq_flags;
+
+ if (!kmem_tracing_enabled)
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_FREE;
+ entry->type_id = type_id;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+}
+EXPORT_SYMBOL(kmemtrace_mark_free);
+
+static struct tracer kmem_tracer __read_mostly = {
+ .name = "kmemtrace",
+ .init = kmem_trace_init,
+ .reset = kmem_trace_reset,
+ .print_line = kmemtrace_print_line,
+ .print_header = kmemtrace_headers,
+ .flags = &kmem_tracer_flags
+};
+
+void kmemtrace_init(void)
+{
+ /* earliest opportunity to start kmem tracing */
+}
+
+static int __init init_kmem_tracer(void)
+{
+ return register_tracer(&kmem_tracer);
+}
+
+device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..b36d7374ceef 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
-#define RB_ALIGNMENT_SHIFT 2
-#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA 28
enum {
@@ -133,7 +132,7 @@ enum {
};
/* inline for ring buffer fast paths */
-static inline unsigned
+static unsigned
rb_event_length(struct ring_buffer_event *event)
{
unsigned length;
@@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event)
case RINGBUF_TYPE_DATA:
if (event->len)
- length = event->len << RB_ALIGNMENT_SHIFT;
+ length = event->len * RB_ALIGNMENT;
else
length = event->array[0];
return length + RB_EVNT_HDR_SIZE;
@@ -179,7 +178,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
EXPORT_SYMBOL_GPL(ring_buffer_event_length);
/* inline for ring buffer fast paths */
-static inline void *
+static void *
rb_event_data(struct ring_buffer_event *event)
{
BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -229,10 +228,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
* Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
* this issue out.
*/
-static inline void free_buffer_page(struct buffer_page *bpage)
+static void free_buffer_page(struct buffer_page *bpage)
{
- if (bpage->page)
- free_page((unsigned long)bpage->page);
+ free_page((unsigned long)bpage->page);
kfree(bpage);
}
@@ -246,7 +244,7 @@ static inline int test_time_stamp(u64 delta)
return 0;
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
+#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
/*
* head_page == tail_page && head == tail then buffer is empty.
@@ -811,7 +809,7 @@ rb_event_index(struct ring_buffer_event *event)
return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
}
-static inline int
+static int
rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -825,7 +823,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_commit_index(cpu_buffer) == index;
}
-static inline void
+static void
rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -850,7 +848,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
local_set(&cpu_buffer->commit_page->page->commit, index);
}
-static inline void
+static void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
/*
@@ -896,7 +894,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read = 0;
}
-static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+static void rb_inc_iter(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
@@ -926,7 +924,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
* and with this, we can determine what to place into the
* data field.
*/
-static inline void
+static void
rb_update_event(struct ring_buffer_event *event,
unsigned type, unsigned length)
{
@@ -938,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event,
break;
case RINGBUF_TYPE_TIME_EXTEND:
- event->len =
- (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
+ event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
break;
case RINGBUF_TYPE_TIME_STAMP:
- event->len =
- (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
+ event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
break;
case RINGBUF_TYPE_DATA:
@@ -955,16 +949,14 @@ rb_update_event(struct ring_buffer_event *event,
event->len = 0;
event->array[0] = length;
} else
- event->len =
- (length + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
+ event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
break;
default:
BUG();
}
}
-static inline unsigned rb_calculate_event_length(unsigned length)
+static unsigned rb_calculate_event_length(unsigned length)
{
struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1025,12 +1017,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
}
if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE)) {
- /* reset write */
- if (tail <= BUF_PAGE_SIZE)
- local_set(&tail_page->write, tail);
+ if (!(buffer->flags & RB_FL_OVERWRITE))
goto out_unlock;
- }
/* tail_page has not moved yet? */
if (tail_page == cpu_buffer->tail_page) {
@@ -1105,6 +1093,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return event;
out_unlock:
+ /* reset write */
+ if (tail <= BUF_PAGE_SIZE)
+ local_set(&tail_page->write, tail);
+
__raw_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
return NULL;
@@ -1438,7 +1430,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
struct buffer_page *head = cpu_buffer->head_page;
@@ -2174,6 +2166,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->overrun = 0;
cpu_buffer->entries = 0;
+
+ cpu_buffer->write_stamp = 0;
+ cpu_buffer->read_stamp = 0;
}
/**
@@ -2274,9 +2269,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
if (buffer_a->pages != buffer_b->pages)
return -EINVAL;
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ return -EAGAIN;
+
+ if (atomic_read(&buffer_a->record_disabled))
+ return -EAGAIN;
+
+ if (atomic_read(&buffer_b->record_disabled))
+ return -EAGAIN;
+
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ if (atomic_read(&cpu_buffer_a->record_disabled))
+ return -EAGAIN;
+
+ if (atomic_read(&cpu_buffer_b->record_disabled))
+ return -EAGAIN;
+
/*
* We can't do a synchronize_sched here because this
* function can be called in atomic context.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add95..757ae6f7e648 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -37,10 +37,11 @@
#include <linux/irqflags.h>
#include "trace.h"
+#include "trace_output.h"
#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly tracing_max_latency;
unsigned long __read_mostly tracing_thresh;
/*
@@ -186,9 +187,6 @@ int tracing_is_enabled(void)
return tracer_enabled;
}
-/* function tracing enabled */
-int ftrace_function_enabled;
-
/*
* trace_buf_size is the size in bytes that is allocated
* for a buffer. Note, the number of bytes is always rounded
@@ -329,132 +327,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
tracing_record_cmdline(current);
}
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
- int ret;
-
- if (!len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len)
- return 0;
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-static int
-trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (len > ((PAGE_SIZE - 1) - s->len))
- return 0;
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-static int
-trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->len >= (PAGE_SIZE - 1))
- return 0;
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-
-static int
-trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
-{
- if (len > ((PAGE_SIZE - 1) - s->len))
- return 0;
-
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
-static int
-trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- unsigned char *data = mem;
- int i, j;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-static int
-trace_seq_path(struct trace_seq *s, struct path *path)
-{
- unsigned char *p;
-
- if (s->len >= (PAGE_SIZE - 1))
- return 0;
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
-
- return 0;
-}
-
static void
trace_seq_reset(struct trace_seq *s)
{
@@ -543,7 +415,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
ftrace_enable_cpu();
- WARN_ON_ONCE(ret);
+ WARN_ON_ONCE(ret && ret != -EAGAIN);
__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
@@ -960,10 +832,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
trace_function(tr, data, ip, parent_ip, flags, pc);
}
-static void ftrace_trace_stack(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long flags,
- int skip, int pc)
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip, int pc)
{
#ifdef CONFIG_STACKTRACE
struct ring_buffer_event *event;
@@ -971,9 +843,6 @@ static void ftrace_trace_stack(struct trace_array *tr,
struct stack_trace trace;
unsigned long irq_flags;
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
- return;
-
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
if (!event)
@@ -994,12 +863,23 @@ static void ftrace_trace_stack(struct trace_array *tr,
#endif
}
+static void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip, int pc)
+{
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ __ftrace_trace_stack(tr, data, flags, skip, pc);
+}
+
void __trace_stack(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long flags,
- int skip)
+ int skip, int pc)
{
- ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+ __ftrace_trace_stack(tr, data, flags, skip, pc);
}
static void ftrace_trace_userstack(struct trace_array *tr,
@@ -1163,65 +1043,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
local_irq_restore(flags);
}
-#ifdef CONFIG_FUNCTION_TRACER
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu, resched;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- pc = preempt_count();
- resched = ftrace_preempt_disable();
- local_save_flags(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, data, ip, parent_ip, flags, pc);
-
- atomic_dec(&data->disabled);
- ftrace_preempt_enable(resched);
-}
-
-static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (unlikely(!ftrace_function_enabled))
- return;
-
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1)) {
- pc = preempt_count();
- trace_function(tr, data, ip, parent_ip, flags, pc);
- }
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
int trace_graph_entry(struct ftrace_graph_ent *trace)
{
@@ -1279,31 +1100,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = function_trace_call,
-};
-
-void tracing_start_function_trace(void)
-{
- ftrace_function_enabled = 0;
-
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
-
- register_ftrace_function(&trace_ops);
- ftrace_function_enabled = 1;
-}
-
-void tracing_stop_function_trace(void)
-{
- ftrace_function_enabled = 0;
- unregister_ftrace_function(&trace_ops);
-}
-#endif
-
enum trace_file_type {
TRACE_FILE_LAT_FMT = 1,
TRACE_FILE_ANNOTATE = 2,
@@ -1472,154 +1268,6 @@ static void s_stop(struct seq_file *m, void *p)
mutex_unlock(&trace_types_lock);
}
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
-{
- static const char tramp_name[] = "kretprobe_trampoline";
- int size = sizeof(tramp_name);
-
- if (strncmp(tramp_name, name, size) == 0)
- return "[unknown/kretprobe'd]";
- return name;
-}
-#else
-static inline const char *kretprobed(const char *name)
-{
- return name;
-}
-#endif /* CONFIG_KRETPROBES */
-
-static int
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
- const char *name;
-
- kallsyms_lookup(address, NULL, NULL, NULL, str);
-
- name = kretprobed(str);
-
- return trace_seq_printf(s, fmt, name);
-#endif
- return 1;
-}
-
-static int
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
- unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
- const char *name;
-
- sprint_symbol(str, address);
- name = kretprobed(str);
-
- return trace_seq_printf(s, fmt, name);
-#endif
- return 1;
-}
-
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
-int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
-{
- int ret;
-
- if (!ip)
- return trace_seq_printf(s, "0");
-
- if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
- else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
-
- if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
-}
-
-static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags)
-{
- struct file *file = NULL;
- unsigned long vmstart = 0;
- int ret = 1;
-
- if (mm) {
- const struct vm_area_struct *vma;
-
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, ip);
- if (vma) {
- file = vma->vm_file;
- vmstart = vma->vm_start;
- }
- if (file) {
- ret = trace_seq_path(s, &file->f_path);
- if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
- }
- up_read(&mm->mmap_sem);
- }
- if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
-}
-
-static int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
- unsigned long sym_flags)
-{
- struct mm_struct *mm = NULL;
- int ret = 1;
- unsigned int i;
-
- if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
- struct task_struct *task;
- /*
- * we do the lookup on the thread group leader,
- * since individual threads might have already quit!
- */
- rcu_read_lock();
- task = find_task_by_vpid(entry->ent.tgid);
- if (task)
- mm = get_task_mm(task);
- rcu_read_unlock();
- }
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- unsigned long ip = entry->caller[i];
-
- if (ip == ULONG_MAX || !ret)
- break;
- if (i && ret)
- ret = trace_seq_puts(s, " <- ");
- if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- continue;
- }
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- }
-
- if (mm)
- mmput(mm);
- return ret;
-}
-
static void print_lat_help_header(struct seq_file *m)
{
seq_puts(m, "# _------=> CPU# \n");
@@ -1755,52 +1403,6 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
trace_seq_puts(s, " : ");
}
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
-/*
- * The message is supposed to contain an ending newline.
- * If the printing stops prematurely, try to add a newline of our own.
- */
-void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
-{
- struct trace_entry *ent;
- struct trace_field_cont *cont;
- bool ok = true;
-
- ent = peek_next_entry(iter, iter->cpu, NULL);
- if (!ent || ent->type != TRACE_CONT) {
- trace_seq_putc(s, '\n');
- return;
- }
-
- do {
- cont = (struct trace_field_cont *)ent;
- if (ok)
- ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
-
- ftrace_disable_cpu();
-
- if (iter->buffer_iter[iter->cpu])
- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
- else
- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
-
- ftrace_enable_cpu();
-
- ent = peek_next_entry(iter, iter->cpu, NULL);
- } while (ent && ent->type == TRACE_CONT);
-
- if (!ok)
- trace_seq_putc(s, '\n');
-}
-
static void test_cpu_buff_start(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -1824,17 +1426,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct trace_entry *next_entry;
+ struct trace_event *event;
unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
struct trace_entry *entry = iter->ent;
unsigned long abs_usecs;
unsigned long rel_usecs;
u64 next_ts;
char *comm;
- int S, T;
- int i;
-
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
+ int ret;
test_cpu_buff_start(iter);
@@ -1859,96 +1458,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
lat_print_generic(s, entry, cpu);
lat_print_timestamp(s, abs_usecs, rel_usecs);
}
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_puts(s, " (");
- seq_print_ip_sym(s, field->parent_ip, sym_flags);
- trace_seq_puts(s, ")\n");
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
- comm = trace_find_cmdline(field->next_pid);
- trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, entry->type == TRACE_CTX ? "==>" : " +",
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm);
- break;
- }
- case TRACE_SPECIAL: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- break;
- }
- case TRACE_STACK: {
- struct stack_entry *field;
-
- trace_assign_type(field, entry);
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- if (i)
- trace_seq_puts(s, " <= ");
- seq_print_ip_sym(s, field->caller[i], sym_flags);
- }
- trace_seq_puts(s, "\n");
- break;
- }
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_printf(s, ": %s", field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
- }
- case TRACE_BRANCH: {
- struct trace_branch *field;
-
- trace_assign_type(field, entry);
-
- trace_seq_printf(s, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line);
- break;
+ event = ftrace_find_event(entry->type);
+ if (event && event->latency_trace) {
+ ret = event->latency_trace(s, entry, sym_flags);
+ if (ret)
+ return ret;
+ return TRACE_TYPE_HANDLED;
}
- case TRACE_USER_STACK: {
- struct userstack_entry *field;
- trace_assign_type(field, entry);
-
- seq_print_userip_objs(field, s, sym_flags);
- trace_seq_putc(s, '\n');
- break;
- }
- default:
- trace_seq_printf(s, "Unknown type %d\n", entry->type);
- }
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
return TRACE_TYPE_HANDLED;
}
@@ -1957,19 +1476,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct trace_entry *entry;
+ struct trace_event *event;
unsigned long usec_rem;
unsigned long long t;
unsigned long secs;
char *comm;
int ret;
- int S, T;
- int i;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
test_cpu_buff_start(iter);
comm = trace_find_cmdline(iter->ent->pid);
@@ -1988,129 +1503,17 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = seq_print_ip_sym(s, field->ip, sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
- field->parent_ip) {
- ret = trace_seq_printf(s, " <-");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = seq_print_ip_sym(s,
- field->parent_ip,
- sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_printf(s, "\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
- ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- entry->type == TRACE_CTX ? "==>" : " +",
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_SPECIAL: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_STACK: {
- struct stack_entry *field;
-
- trace_assign_type(field, entry);
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- if (i) {
- ret = trace_seq_puts(s, " <= ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = seq_print_ip_sym(s, field->caller[i],
- sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_puts(s, "\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_printf(s, ": %s", field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
- }
- case TRACE_GRAPH_RET: {
- return print_graph_function(iter);
- }
- case TRACE_GRAPH_ENT: {
- return print_graph_function(iter);
- }
- case TRACE_BRANCH: {
- struct trace_branch *field;
-
- trace_assign_type(field, entry);
-
- trace_seq_printf(s, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line);
- break;
+ event = ftrace_find_event(entry->type);
+ if (event && event->trace) {
+ ret = event->trace(s, entry, sym_flags);
+ if (ret)
+ return ret;
+ return TRACE_TYPE_HANDLED;
}
- case TRACE_USER_STACK: {
- struct userstack_entry *field;
-
- trace_assign_type(field, entry);
+ ret = trace_seq_printf(s, "Unknown type %d\n", entry->type);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
- ret = seq_print_userip_objs(field, s, sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- }
return TRACE_TYPE_HANDLED;
}
@@ -2118,152 +1521,47 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
+ struct trace_event *event;
int ret;
- int S, T;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
ret = trace_seq_printf(s, "%d %d %llu ",
entry->pid, iter->cpu, iter->ts);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "%x %x\n",
- field->ip,
- field->parent_ip);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = entry->type == TRACE_WAKE ? '+' :
- task_state_char(field->prev_state);
- ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
+ event = ftrace_find_event(entry->type);
+ if (event && event->raw) {
+ ret = event->raw(s, entry, 0);
+ if (ret)
+ return ret;
+ return TRACE_TYPE_HANDLED;
}
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
+ ret = trace_seq_printf(s, "%d ?\n", entry->type);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
- trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
- }
- }
return TRACE_TYPE_HANDLED;
}
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return 0; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return 0; \
-} while (0)
-
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
unsigned char newline = '\n';
struct trace_entry *entry;
- int S, T;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = entry->type == TRACE_WAKE ? '+' :
- task_state_char(field->prev_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
- break;
- }
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
+ event = ftrace_find_event(entry->type);
+ if (event && event->hex)
+ event->hex(s, entry, 0);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
- break;
- }
- }
SEQ_PUT_FIELD_RET(s, newline);
return TRACE_TYPE_HANDLED;
@@ -2282,9 +1580,6 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
-
return TRACE_TYPE_HANDLED;
}
@@ -2292,53 +1587,19 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
SEQ_PUT_FIELD_RET(s, entry->pid);
SEQ_PUT_FIELD_RET(s, entry->cpu);
SEQ_PUT_FIELD_RET(s, iter->ts);
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
- break;
- }
- case TRACE_CTX: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
+ event = ftrace_find_event(entry->type);
+ if (event && event->binary)
+ event->binary(s, entry, 0);
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
- break;
- }
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_FIELD_RET(s, field->arg1);
- SEQ_PUT_FIELD_RET(s, field->arg2);
- SEQ_PUT_FIELD_RET(s, field->arg3);
- break;
- }
- }
- return 1;
+ return TRACE_TYPE_HANDLED;
}
static int trace_empty(struct trace_iterator *iter)
@@ -3736,7 +2997,7 @@ static struct notifier_block trace_die_notifier = {
* it if we decide to change what log level the ftrace dump
* should be at.
*/
-#define KERN_TRACE KERN_INFO
+#define KERN_TRACE KERN_EMERG
static void
trace_printk_seq(struct trace_seq *s)
@@ -3770,6 +3031,7 @@ void ftrace_dump(void)
dump_ran = 1;
/* No turning back! */
+ tracing_off();
ftrace_kill();
for_each_tracing_cpu(cpu) {
@@ -3877,7 +3139,6 @@ __init static int tracer_alloc_buffers(void)
#else
current_trace = &nop_trace;
#endif
-
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381bfd95..b96037d970df 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,7 @@
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
+#include <trace/kmemtrace.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -16,7 +17,6 @@ enum trace_type {
TRACE_FN,
TRACE_CTX,
TRACE_WAKE,
- TRACE_CONT,
TRACE_STACK,
TRACE_PRINT,
TRACE_SPECIAL,
@@ -29,9 +29,11 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
+ TRACE_KMEM_ALLOC,
+ TRACE_KMEM_FREE,
TRACE_POWER,
- __TRACE_LAST_TYPE
+ __TRACE_LAST_TYPE,
};
/*
@@ -170,6 +172,24 @@ struct trace_power {
struct power_trace state_data;
};
+struct kmemtrace_alloc_entry {
+ struct trace_entry ent;
+ enum kmemtrace_type_id type_id;
+ unsigned long call_site;
+ const void *ptr;
+ size_t bytes_req;
+ size_t bytes_alloc;
+ gfp_t gfp_flags;
+ int node;
+};
+
+struct kmemtrace_free_entry {
+ struct trace_entry ent;
+ enum kmemtrace_type_id type_id;
+ unsigned long call_site;
+ const void *ptr;
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -178,7 +198,6 @@ struct trace_power {
* NEED_RESCED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
- * CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +205,6 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
@@ -262,7 +280,6 @@ extern void __ftrace_bad_type(void);
do { \
IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
- IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
@@ -280,6 +297,10 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
+ TRACE_KMEM_ALLOC); \
+ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
+ TRACE_KMEM_FREE); \
__ftrace_bad_type(); \
} while (0)
@@ -313,6 +334,7 @@ struct tracer_flags {
/* Makes more easy to define a tracer opt */
#define TRACER_OPT(s, b) .name = #s, .bit = b
+
/*
* A specific tracer, represented by methods that operate on a trace array:
*/
@@ -340,6 +362,7 @@ struct tracer {
struct tracer *next;
int print_max;
struct tracer_flags *flags;
+ struct tracer_stat *stats;
};
struct trace_seq {
@@ -415,7 +438,6 @@ void trace_function(struct trace_array *tr,
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -434,15 +456,12 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-extern cycle_t ftrace_now(int cpu);
+void __trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip, int pc);
-#ifdef CONFIG_FUNCTION_TRACER
-void tracing_start_function_trace(void);
-void tracing_stop_function_trace(void);
-#else
-# define tracing_start_function_trace() do { } while (0)
-# define tracing_stop_function_trace() do { } while (0)
-#endif
+extern cycle_t ftrace_now(int cpu);
#ifdef CONFIG_CONTEXT_SWITCH_TRACER
typedef void
@@ -456,10 +475,10 @@ struct tracer_switch_ops {
void *private;
struct tracer_switch_ops *next;
};
-
-char *trace_find_cmdline(int pid);
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+extern char *trace_find_cmdline(int pid);
+
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@@ -488,15 +507,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern void trace_seq_print_cont(struct trace_seq *s,
- struct trace_iterator *iter);
-
-extern int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
- unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
- size_t cnt);
extern long ns2usecs(cycle_t nsec);
extern int
trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 366c8c333e13..0e94b3d091f7 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
#include <linux/kallsyms.h>
#include "trace.h"
+#include "trace_output.h"
static struct trace_array *boot_trace;
static bool pre_initcalls_finished;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac7..ca017e0a9a27 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,12 +14,17 @@
#include <linux/hash.h>
#include <linux/fs.h>
#include <asm/local.h>
+
#include "trace.h"
+#include "trace_stat.h"
+#include "trace_output.h"
#ifdef CONFIG_BRANCH_TRACER
+static struct tracer branch_trace;
static int branch_tracing_enabled __read_mostly;
static DEFINE_MUTEX(branch_tracing_mutex);
+
static struct trace_array *branch_tracer;
static void
@@ -142,22 +147,74 @@ static void branch_trace_reset(struct trace_array *tr)
stop_branch_trace(tr);
}
-struct tracer branch_trace __read_mostly =
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (trace_seq_printf(s, ": %s", field->buf))
+ goto partial;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct trace_branch *field;
+
+ trace_assign_type(field, entry);
+
+ if (trace_seq_printf(s, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+
+
+static struct trace_event trace_branch_event = {
+ .type = TRACE_BRANCH,
+ .trace = trace_branch_print,
+ .latency_trace = trace_branch_print,
+ .raw = trace_nop_print,
+ .hex = trace_nop_print,
+ .binary = trace_nop_print,
+};
+
+static struct tracer branch_trace __read_mostly =
{
.name = "branch",
.init = branch_trace_init,
.reset = branch_trace_reset,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_branch,
-#endif
+#endif /* CONFIG_FTRACE_SELFTEST */
};
-__init static int init_branch_trace(void)
+__init static int init_branch_tracer(void)
{
+ int ret;
+
+ ret = register_ftrace_event(&trace_branch_event);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "branch events\n");
+ return 1;
+ }
return register_tracer(&branch_trace);
}
+device_initcall(init_branch_tracer);
-device_initcall(init_branch_trace);
#else
static inline
void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,66 +240,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
}
EXPORT_SYMBOL(ftrace_likely_update);
-struct ftrace_pointer {
- void *start;
- void *stop;
- int hit;
-};
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
-static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+static int annotated_branch_stat_headers(struct seq_file *m)
{
- const struct ftrace_pointer *f = m->private;
- struct ftrace_branch_data *p = v;
-
- (*pos)++;
-
- if (v == (void *)1)
- return f->start;
-
- ++p;
-
- if ((void *)p >= (void *)f->stop)
- return NULL;
-
- return p;
+ seq_printf(m, " correct incorrect %% ");
+ seq_printf(m, " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static inline long get_incorrect_percent(struct ftrace_branch_data *p)
{
- void *t = (void *)1;
- loff_t l = 0;
-
- for (; t && l < *pos; t = t_next(m, t, &l))
- ;
+ long percent;
- return t;
-}
+ if (p->correct) {
+ percent = p->incorrect * 100;
+ percent /= p->correct + p->incorrect;
+ } else
+ percent = p->incorrect ? 100 : -1;
-static void t_stop(struct seq_file *m, void *p)
-{
+ return percent;
}
-static int t_show(struct seq_file *m, void *v)
+static int branch_stat_show(struct seq_file *m, void *v)
{
- const struct ftrace_pointer *fp = m->private;
struct ftrace_branch_data *p = v;
const char *f;
long percent;
- if (v == (void *)1) {
- if (fp->hit)
- seq_printf(m, " miss hit %% ");
- else
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
- return 0;
- }
-
/* Only print the file, not the path */
f = p->file + strlen(p->file);
while (f >= p->file && *f != '/')
@@ -252,11 +282,7 @@ static int t_show(struct seq_file *m, void *v)
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
- if (p->correct) {
- percent = p->incorrect * 100;
- percent /= p->correct + p->incorrect;
- } else
- percent = p->incorrect ? 100 : -1;
+ percent = get_incorrect_percent(p);
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
@@ -267,76 +293,118 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
-static struct seq_operations tracing_likely_seq_ops = {
- .start = t_start,
- .next = t_next,
- .stop = t_stop,
- .show = t_show,
+static void *annotated_branch_stat_start(void)
+{
+ return __start_annotated_branch_profile;
+}
+
+static void *
+annotated_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
+
+ ++p;
+
+ if ((void *)p >= (void *)__stop_annotated_branch_profile)
+ return NULL;
+
+ return p;
+}
+
+static int annotated_branch_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_branch_data *a = p1;
+ struct ftrace_branch_data *b = p2;
+
+ long percent_a, percent_b;
+
+ percent_a = get_incorrect_percent(a);
+ percent_b = get_incorrect_percent(b);
+
+ if (percent_a < percent_b)
+ return -1;
+ if (percent_a > percent_b)
+ return 1;
+ else
+ return 0;
+}
+
+static struct tracer_stat annotated_branch_stats = {
+ .name = "branch_annotated",
+ .stat_start = annotated_branch_stat_start,
+ .stat_next = annotated_branch_stat_next,
+ .stat_cmp = annotated_branch_stat_cmp,
+ .stat_headers = annotated_branch_stat_headers,
+ .stat_show = branch_stat_show
};
-static int tracing_branch_open(struct inode *inode, struct file *file)
+__init static int init_annotated_branch_stats(void)
{
int ret;
- ret = seq_open(file, &tracing_likely_seq_ops);
+ ret = register_stat_tracer(&annotated_branch_stats);
if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = (void *)inode->i_private;
+ printk(KERN_WARNING "Warning: could not register "
+ "annotated branches stats\n");
+ return 1;
}
-
- return ret;
+ return 0;
}
-
-static const struct file_operations tracing_branch_fops = {
- .open = tracing_branch_open,
- .read = seq_read,
- .llseek = seq_lseek,
-};
+fs_initcall(init_annotated_branch_stats);
#ifdef CONFIG_PROFILE_ALL_BRANCHES
+
extern unsigned long __start_branch_profile[];
extern unsigned long __stop_branch_profile[];
-static const struct ftrace_pointer ftrace_branch_pos = {
- .start = __start_branch_profile,
- .stop = __stop_branch_profile,
- .hit = 1,
-};
+static int all_branch_stat_headers(struct seq_file *m)
+{
+ seq_printf(m, " miss hit %% ");
+ seq_printf(m, " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+}
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+static void *all_branch_stat_start(void)
+{
+ return __start_branch_profile;
+}
-extern unsigned long __start_annotated_branch_profile[];
-extern unsigned long __stop_annotated_branch_profile[];
+static void *
+all_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
-static const struct ftrace_pointer ftrace_annotated_branch_pos = {
- .start = __start_annotated_branch_profile,
- .stop = __stop_annotated_branch_profile,
-};
+ ++p;
-static __init int ftrace_branch_init(void)
-{
- struct dentry *d_tracer;
- struct dentry *entry;
+ if ((void *)p >= (void *)__stop_branch_profile)
+ return NULL;
- d_tracer = tracing_init_dentry();
+ return p;
+}
- entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
- (void *)&ftrace_annotated_branch_pos,
- &tracing_branch_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'profile_annotatet_branch' entry\n");
+static struct tracer_stat all_branch_stats = {
+ .name = "branch_all",
+ .stat_start = all_branch_stat_start,
+ .stat_next = all_branch_stat_next,
+ .stat_headers = all_branch_stat_headers,
+ .stat_show = branch_stat_show
+};
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
- entry = debugfs_create_file("profile_branch", 0444, d_tracer,
- (void *)&ftrace_branch_pos,
- &tracing_branch_fops);
- if (!entry)
- pr_warning("Could not create debugfs"
- " 'profile_branch' entry\n");
-#endif
+__init static int all_annotated_branch_stats(void)
+{
+ int ret;
+ ret = register_stat_tracer(&all_branch_stats);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "all branches stats\n");
+ return 1;
+ }
return 0;
}
-
-device_initcall(ftrace_branch_init);
+fs_initcall(all_annotated_branch_stats);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a16..b3a320f8aba7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,8 +16,17 @@
#include "trace.h"
+/* function tracing enabled */
+static int ftrace_function_enabled;
+
+static struct trace_array *func_trace;
+
+static void tracing_start_function_trace(void);
+static void tracing_stop_function_trace(void);
+
static void start_function_trace(struct trace_array *tr)
{
+ func_trace = tr;
tr->cpu = get_cpu();
tracing_reset_online_cpus(tr);
put_cpu();
@@ -48,14 +57,188 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(tr);
}
+static void
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu, resched;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ pc = preempt_count();
+ resched = ftrace_preempt_disable();
+ local_save_flags(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1))
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+
+ atomic_dec(&data->disabled);
+ ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+ /*
+ * skip over 5 funcs:
+ * __ftrace_trace_stack,
+ * __trace_stack,
+ * function_stack_trace_call
+ * ftrace_list_func
+ * ftrace_call
+ */
+ __trace_stack(tr, data, flags, 5, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = function_trace_call,
+};
+
+static struct ftrace_ops trace_stack_ops __read_mostly =
+{
+ .func = function_stack_trace_call,
+};
+
+/* Our two options */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static struct tracer_opt func_opts[] = {
+#ifdef CONFIG_STACKTRACE
+ { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
+#endif
+ { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags func_flags = {
+ .val = 0, /* By default: all flags disabled */
+ .opts = func_opts
+};
+
+static void tracing_start_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+
+ if (trace_flags & TRACE_ITER_PREEMPTONLY)
+ trace_ops.func = function_trace_call_preempt_only;
+ else
+ trace_ops.func = function_trace_call;
+
+ if (func_flags.val & TRACE_FUNC_OPT_STACK)
+ register_ftrace_function(&trace_stack_ops);
+ else
+ register_ftrace_function(&trace_ops);
+
+ ftrace_function_enabled = 1;
+}
+
+static void tracing_stop_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+ /* OK if they are not registered */
+ unregister_ftrace_function(&trace_stack_ops);
+ unregister_ftrace_function(&trace_ops);
+}
+
+static int func_set_flag(u32 old_flags, u32 bit, int set)
+{
+ if (bit == TRACE_FUNC_OPT_STACK) {
+ /* do nothing if already set */
+ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
+ return 0;
+
+ if (set) {
+ unregister_ftrace_function(&trace_ops);
+ register_ftrace_function(&trace_stack_ops);
+ } else {
+ unregister_ftrace_function(&trace_stack_ops);
+ register_ftrace_function(&trace_ops);
+ }
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
static struct tracer function_trace __read_mostly =
{
- .name = "function",
- .init = function_trace_init,
- .reset = function_trace_reset,
- .start = function_trace_start,
+ .name = "function",
+ .init = function_trace_init,
+ .reset = function_trace_reset,
+ .start = function_trace_start,
+ .flags = &func_flags,
+ .set_flag = func_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_function,
+ .selftest = trace_selftest_startup_function,
#endif
};
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38e..3c545984816f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include "trace.h"
+#include "trace_output.h"
#define TRACE_GRAPH_INDENT 2
@@ -589,8 +590,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (ent->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
+ /* Strip ending newline */
+ if (s->buffer[s->len - 1] == '\n') {
+ s->buffer[s->len - 1] = '\0';
+ s->len--;
+ }
ret = trace_seq_printf(s, " */\n");
if (!ret)
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 649df22d435f..fff3545fc866 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,7 +1,8 @@
/*
* h/w branch tracer for x86 based on bts
*
- * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ * Copyright (C) 2008-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
*
*/
@@ -10,21 +11,44 @@
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
#include <asm/ds.h>
#include "trace.h"
+#include "trace_output.h"
#define SIZEOF_BTS (1 << 13)
+/* The tracer mutex protects the below per-cpu tracer array.
+ It needs to be held to:
+ - start tracing on all cpus
+ - stop tracing on all cpus
+ - start tracing on a single hotplug cpu
+ - stop tracing on a single hotplug cpu
+ - read the trace from all cpus
+ - read the trace from a single cpu
+*/
+static DEFINE_MUTEX(bts_tracer_mutex);
static DEFINE_PER_CPU(struct bts_tracer *, tracer);
static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
#define this_tracer per_cpu(tracer, smp_processor_id())
#define this_buffer per_cpu(buffer, smp_processor_id())
+static int __read_mostly trace_hw_branches_enabled;
+static struct trace_array *hw_branch_trace __read_mostly;
+
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
static void bts_trace_start_cpu(void *arg)
{
if (this_tracer)
@@ -42,14 +66,20 @@ static void bts_trace_start_cpu(void *arg)
static void bts_trace_start(struct trace_array *tr)
{
- int cpu;
+ mutex_lock(&bts_tracer_mutex);
- tracing_reset_online_cpus(tr);
+ on_each_cpu(bts_trace_start_cpu, NULL, 1);
+ trace_hw_branches_enabled = 1;
- for_each_cpu(cpu, cpu_possible_mask)
- smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+ mutex_unlock(&bts_tracer_mutex);
}
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
static void bts_trace_stop_cpu(void *arg)
{
if (this_tracer) {
@@ -60,26 +90,63 @@ static void bts_trace_stop_cpu(void *arg)
static void bts_trace_stop(struct trace_array *tr)
{
- int cpu;
+ mutex_lock(&bts_tracer_mutex);
+
+ trace_hw_branches_enabled = 0;
+ on_each_cpu(bts_trace_stop_cpu, NULL, 1);
- for_each_cpu(cpu, cpu_possible_mask)
+ mutex_unlock(&bts_tracer_mutex);
+}
+
+static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ mutex_lock(&bts_tracer_mutex);
+
+ if (!trace_hw_branches_enabled)
+ goto out;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+ break;
+ case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+ break;
+ }
+
+ out:
+ mutex_unlock(&bts_tracer_mutex);
+ return NOTIFY_DONE;
}
+static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
+ .notifier_call = bts_hotcpu_handler
+};
+
static int bts_trace_init(struct trace_array *tr)
{
+ hw_branch_trace = tr;
+
+ register_hotcpu_notifier(&bts_hotcpu_notifier);
tracing_reset_online_cpus(tr);
bts_trace_start(tr);
return 0;
}
+static void bts_trace_reset(struct trace_array *tr)
+{
+ bts_trace_stop(tr);
+ unregister_hotcpu_notifier(&bts_hotcpu_notifier);
+}
+
static void bts_trace_print_header(struct seq_file *m)
{
- seq_puts(m,
- "# CPU# FROM TO FUNCTION\n");
- seq_puts(m,
- "# | | | |\n");
+ seq_puts(m, "# CPU# TO <- FROM\n");
}
static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -87,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
struct trace_entry *entry = iter->ent;
struct trace_seq *seq = &iter->seq;
struct hw_branch_entry *it;
+ unsigned long symflags = TRACE_ITER_SYM_OFFSET;
trace_assign_type(it, entry);
if (entry->type == TRACE_HW_BRANCHES) {
if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
- trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
- it->from, it->to) &&
- (!it->from ||
- seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
+ seq_print_ip_sym(seq, it->to, symflags) &&
+ trace_seq_printf(seq, "\t <- ") &&
+ seq_print_ip_sym(seq, it->from, symflags) &&
trace_seq_printf(seq, "\n"))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +170,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
return TRACE_TYPE_UNHANDLED;
}
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
+void trace_hw_branch(u64 from, u64 to)
{
+ struct trace_array *tr = hw_branch_trace;
struct ring_buffer_event *event;
struct hw_branch_entry *entry;
- unsigned long irq;
+ unsigned long irq1, irq2;
+ int cpu;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
- if (!event)
+ if (unlikely(!tr))
return;
+
+ if (unlikely(!trace_hw_branches_enabled))
+ return;
+
+ local_irq_save(irq1);
+ cpu = raw_smp_processor_id();
+ if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+ goto out;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2);
+ if (!event)
+ goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, from);
entry->ent.type = TRACE_HW_BRANCHES;
- entry->ent.cpu = smp_processor_id();
+ entry->ent.cpu = cpu;
entry->from = from;
entry->to = to;
- ring_buffer_unlock_commit(tr->buffer, event, irq);
+ ring_buffer_unlock_commit(tr->buffer, event, irq2);
+
+ out:
+ atomic_dec(&tr->data[cpu]->disabled);
+ local_irq_restore(irq1);
}
-static void trace_bts_at(struct trace_array *tr,
- const struct bts_trace *trace, void *at)
+static void trace_bts_at(const struct bts_trace *trace, void *at)
{
struct bts_struct bts;
int err = 0;
@@ -137,18 +220,29 @@ static void trace_bts_at(struct trace_array *tr,
switch (bts.qualifier) {
case BTS_BRANCH:
- trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
+ trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
break;
}
}
+/*
+ * Collect the trace on the current cpu and write it into the ftrace buffer.
+ *
+ * pre: bts_tracer_mutex must be locked
+ */
static void trace_bts_cpu(void *arg)
{
struct trace_array *tr = (struct trace_array *) arg;
const struct bts_trace *trace;
unsigned char *at;
- if (!this_tracer)
+ if (unlikely(!tr))
+ return;
+
+ if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
+ return;
+
+ if (unlikely(!this_tracer))
return;
ds_suspend_bts(this_tracer);
@@ -158,11 +252,11 @@ static void trace_bts_cpu(void *arg)
for (at = trace->ds.top; (void *)at < trace->ds.end;
at += trace->ds.size)
- trace_bts_at(tr, trace, at);
+ trace_bts_at(trace, at);
for (at = trace->ds.begin; (void *)at < trace->ds.top;
at += trace->ds.size)
- trace_bts_at(tr, trace, at);
+ trace_bts_at(trace, at);
out:
ds_resume_bts(this_tracer);
@@ -170,22 +264,38 @@ out:
static void trace_bts_prepare(struct trace_iterator *iter)
{
- int cpu;
+ mutex_lock(&bts_tracer_mutex);
+
+ on_each_cpu(trace_bts_cpu, iter->tr, 1);
+
+ mutex_unlock(&bts_tracer_mutex);
+}
+
+static void trace_bts_close(struct trace_iterator *iter)
+{
+ tracing_reset_online_cpus(iter->tr);
+}
+
+void trace_hw_branch_oops(void)
+{
+ mutex_lock(&bts_tracer_mutex);
+
+ trace_bts_cpu(hw_branch_trace);
- for_each_cpu(cpu, cpu_possible_mask)
- smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+ mutex_unlock(&bts_tracer_mutex);
}
struct tracer bts_tracer __read_mostly =
{
.name = "hw-branch-tracer",
.init = bts_trace_init,
- .reset = bts_trace_stop,
+ .reset = bts_trace_reset,
.print_header = bts_trace_print_header,
.print_line = bts_trace_print_line,
.start = bts_trace_start,
.stop = bts_trace_stop,
- .open = trace_bts_prepare
+ .open = trace_bts_prepare,
+ .close = trace_bts_close
};
__init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
static void __irqsoff_tracer_init(struct trace_array *tr)
{
+ tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1dc..ec78e244242e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,8 +9,10 @@
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
+#include <asm/atomic.h>
#include "trace.h"
+#include "trace_output.h"
struct header_iter {
struct pci_dev *dev;
@@ -19,6 +21,7 @@ struct header_iter {
static struct trace_array *mmio_trace_array;
static bool overrun_detected;
static unsigned long prev_overruns;
+static atomic_t dropped_count;
static void mmio_reset_data(struct trace_array *tr)
{
@@ -121,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
- unsigned long cnt = 0;
+ unsigned long cnt = atomic_xchg(&dropped_count, 0);
unsigned long over = ring_buffer_overruns(iter->tr->buffer);
if (over > prev_overruns)
- cnt = over - prev_overruns;
+ cnt += over - prev_overruns;
prev_overruns = over;
return cnt;
}
@@ -181,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
- "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
ret = trace_seq_printf(s,
- "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
ret = trace_seq_printf(s,
- "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+ "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
+ "%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -227,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,
- "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+ "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
ret = trace_seq_printf(s,
- "UNMAP %lu.%06lu %d 0x%lx %d\n",
+ "UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
@@ -258,13 +262,10 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+ ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
-
return TRACE_TYPE_HANDLED;
}
@@ -310,8 +311,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
- if (!event)
+ if (!event) {
+ atomic_inc(&dropped_count);
return;
+ }
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_RW;
@@ -338,8 +341,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
- if (!event)
+ if (!event) {
+ atomic_inc(&dropped_count);
return;
+ }
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_MAP;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000000..1a4e144a9f8f
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,829 @@
+/*
+ * trace_output.c
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_output.h"
+
+/* must be a power of 2 */
+#define EVENT_HASHSIZE 128
+
+static DEFINE_MUTEX(trace_event_mutex);
+static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+
+static int next_event_type = __TRACE_LAST_TYPE + 1;
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ va_list ap;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ int len = strlen(str);
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+ unsigned char hex[HEX_CHARS];
+ unsigned char *data = mem;
+ int i, j;
+
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < len; i++) {
+#else
+ for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ hex[j++] = ' ';
+
+ return trace_seq_putmem(s, hex, j);
+}
+
+int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+ unsigned char *p;
+
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+ return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+ unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ sprint_symbol(str, address);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
+{
+ struct file *file = NULL;
+ unsigned long vmstart = 0;
+ int ret = 1;
+
+ if (mm) {
+ const struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma) {
+ file = vma->vm_file;
+ vmstart = vma->vm_start;
+ }
+ if (file) {
+ ret = trace_seq_path(s, &file->f_path);
+ if (ret)
+ ret = trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
+ }
+ up_read(&mm->mmap_sem);
+ }
+ if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+ unsigned long sym_flags)
+{
+ struct mm_struct *mm = NULL;
+ int ret = 1;
+ unsigned int i;
+
+ if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(entry->ent.tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = entry->caller[i];
+
+ if (ip == ULONG_MAX || !ret)
+ break;
+ if (i && ret)
+ ret = trace_seq_puts(s, " <- ");
+ if (!ip) {
+ if (ret)
+ ret = trace_seq_puts(s, "??");
+ continue;
+ }
+ if (!ret)
+ break;
+ if (ret)
+ ret = seq_print_user_ip(s, mm, ip, sym_flags);
+ }
+
+ if (mm)
+ mmput(mm);
+ return ret;
+}
+
+int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+ int ret;
+
+ if (!ip)
+ return trace_seq_printf(s, "0");
+
+ if (sym_flags & TRACE_ITER_SYM_OFFSET)
+ ret = seq_print_sym_offset(s, "%s", ip);
+ else
+ ret = seq_print_sym_short(s, "%s", ip);
+
+ if (!ret)
+ return 0;
+
+ if (sym_flags & TRACE_ITER_SYM_ADDR)
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int task_state_char(unsigned long state)
+{
+ int bit = state ? __ffs(state) + 1 : 0;
+
+ return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
+/**
+ * ftrace_find_event - find a registered event
+ * @type: the type of event to look for
+ *
+ * Returns an event of type @type otherwise NULL
+ */
+struct trace_event *ftrace_find_event(int type)
+{
+ struct trace_event *event;
+ struct hlist_node *n;
+ unsigned key;
+
+ key = type & (EVENT_HASHSIZE - 1);
+
+ hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+ if (event->type == type)
+ return event;
+ }
+
+ return NULL;
+}
+
+/**
+ * register_ftrace_event - register output for an event type
+ * @event: the event type to register
+ *
+ * Event types are stored in a hash and this hash is used to
+ * find a way to print an event. If the @event->type is set
+ * then it will use that type, otherwise it will assign a
+ * type to use.
+ *
+ * If you assign your own type, please make sure it is added
+ * to the trace_type enum in trace.h, to avoid collisions
+ * with the dynamic types.
+ *
+ * Returns the event type number or zero on error.
+ */
+int register_ftrace_event(struct trace_event *event)
+{
+ unsigned key;
+ int ret = 0;
+
+ mutex_lock(&trace_event_mutex);
+
+ if (!event->type)
+ event->type = next_event_type++;
+ else if (event->type > __TRACE_LAST_TYPE) {
+ printk(KERN_WARNING "Need to add type to trace.h\n");
+ WARN_ON(1);
+ }
+
+ if (ftrace_find_event(event->type))
+ goto out;
+
+ key = event->type & (EVENT_HASHSIZE - 1);
+
+ hlist_add_head_rcu(&event->node, &event_hash[key]);
+
+ ret = event->type;
+ out:
+ mutex_unlock(&trace_event_mutex);
+
+ return ret;
+}
+
+/**
+ * unregister_ftrace_event - remove a no longer used event
+ * @event: the event to remove
+ */
+int unregister_ftrace_event(struct trace_event *event)
+{
+ mutex_lock(&trace_event_mutex);
+ hlist_del(&event->node);
+ mutex_unlock(&trace_event_mutex);
+
+ return 0;
+}
+
+/*
+ * Standard events
+ */
+
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return 0;
+}
+
+/* TRACE_FN */
+static int
+trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+ if (!trace_seq_puts(s, " ("))
+ goto partial;
+ if (!seq_print_ip_sym(s, field->parent_ip, flags))
+ goto partial;
+ if (!trace_seq_puts(s, ")\n"))
+ goto partial;
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
+ if (!trace_seq_printf(s, " <-"))
+ goto partial;
+ if (!seq_print_ip_sym(s,
+ field->parent_ip,
+ flags))
+ goto partial;
+ }
+ if (!trace_seq_printf(s, "\n"))
+ goto partial;
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!trace_seq_printf(s, "%lx %lx\n",
+ field->ip,
+ field->parent_ip))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+
+static int
+trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+ SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+
+ return 0;
+}
+
+static int
+trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->ip);
+ SEQ_PUT_FIELD_RET(s, field->parent_ip);
+
+ return 0;
+}
+
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .trace = trace_fn_trace,
+ .latency_trace = trace_fn_latency,
+ .raw = trace_fn_raw,
+ .hex = trace_fn_hex,
+ .binary = trace_fn_bin,
+};
+
+/* TRACE_CTX an TRACE_WAKE */
+static int
+trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
+ char *delim)
+{
+ struct ctx_switch_entry *field;
+ char *comm;
+ int S, T;
+
+ trace_assign_type(field, entry);
+
+ T = task_state_char(field->next_state);
+ S = task_state_char(field->prev_state);
+ comm = trace_find_cmdline(field->next_pid);
+ if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+
+static int
+trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_print(s, entry, flags, "==>");
+}
+
+static int
+trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_print(s, entry, flags, " +");
+}
+
+static int
+trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
+ char S)
+{
+ struct ctx_switch_entry *field;
+ int T;
+
+ trace_assign_type(field, entry);
+
+ if (!S)
+ task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+ if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+
+static int
+trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_raw(s, entry, flags, 0);
+}
+
+static int
+trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_raw(s, entry, flags, '+');
+}
+
+
+static int
+trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
+ char S)
+{
+ struct ctx_switch_entry *field;
+ int T;
+
+ trace_assign_type(field, entry);
+
+ if (!S)
+ task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, S);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, T);
+
+ return 0;
+}
+
+static int
+trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_hex(s, entry, flags, 0);
+}
+
+static int
+trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ return trace_ctxwake_hex(s, entry, flags, '+');
+}
+
+static int
+trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_FIELD_RET(s, field->prev_state);
+ SEQ_PUT_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_FIELD_RET(s, field->next_state);
+
+ return 0;
+}
+
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .trace = trace_ctx_print,
+ .latency_trace = trace_ctx_print,
+ .raw = trace_ctx_raw,
+ .hex = trace_ctx_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .trace = trace_wake_print,
+ .latency_trace = trace_wake_print,
+ .raw = trace_wake_raw,
+ .hex = trace_wake_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+/* TRACE_SPECIAL */
+static int
+trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!trace_seq_printf(s, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+
+static int
+trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
+
+ return 0;
+}
+
+static int
+trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->arg1);
+ SEQ_PUT_FIELD_RET(s, field->arg2);
+ SEQ_PUT_FIELD_RET(s, field->arg3);
+
+ return 0;
+}
+
+static struct trace_event trace_special_event = {
+ .type = TRACE_SPECIAL,
+ .trace = trace_special_print,
+ .latency_trace = trace_special_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_STACK */
+
+static int
+trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct stack_entry *field;
+ int i;
+
+ trace_assign_type(field, entry);
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ if (i) {
+ if (!trace_seq_puts(s, " <= "))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->caller[i], flags))
+ goto partial;
+ }
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+ }
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_stack_event = {
+ .type = TRACE_STACK,
+ .trace = trace_stack_print,
+ .latency_trace = trace_stack_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_USER_STACK */
+static int
+trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
+ int flags)
+{
+ struct userstack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_userip_objs(field, s, flags))
+ goto partial;
+
+ if (!trace_seq_putc(s, '\n'))
+ goto partial;
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_user_stack_event = {
+ .type = TRACE_USER_STACK,
+ .trace = trace_user_stack_print,
+ .latency_trace = trace_user_stack_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_PRINT */
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (!trace_seq_printf(s, ": %s", field->buf))
+ goto partial;
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+ goto partial;
+
+ return 0;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .trace = trace_print_print,
+ .latency_trace = trace_print_print,
+ .raw = trace_print_raw,
+ .hex = trace_nop_print,
+ .binary = trace_nop_print,
+};
+
+static struct trace_event *events[] __initdata = {
+ &trace_fn_event,
+ &trace_ctx_event,
+ &trace_wake_event,
+ &trace_special_event,
+ &trace_stack_event,
+ &trace_user_stack_event,
+ &trace_print_event,
+ NULL
+};
+
+__init static int init_events(void)
+{
+ struct trace_event *event;
+ int i, ret;
+
+ for (i = 0; events[i]; i++) {
+ event = events[i];
+
+ ret = register_ftrace_event(event);
+ if (!ret) {
+ printk(KERN_WARNING "event %d failed to register\n",
+ event->type);
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ return 0;
+}
+device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000000..1cbab5e3dc99
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,60 @@
+#ifndef __TRACE_EVENTS_H
+#define __TRACE_EVENTS_H
+
+#include "trace.h"
+
+typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry,
+ int flags);
+
+struct trace_event {
+ struct hlist_node node;
+ int type;
+ trace_print_func trace;
+ trace_print_func latency_trace;
+ trace_print_func raw;
+ trace_print_func hex;
+ trace_print_func binary;
+};
+
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+ size_t cnt);
+int trace_seq_puts(struct trace_seq *s, const char *str);
+int trace_seq_putc(struct trace_seq *s, unsigned char c);
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_path(struct trace_seq *s, struct path *path);
+int seq_print_userip_objs(const struct userstack_entry *entry,
+ struct trace_seq *s, unsigned long sym_flags);
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags);
+
+struct trace_event *ftrace_find_event(int type);
+int register_ftrace_event(struct trace_event *event);
+int unregister_ftrace_event(struct trace_event *event);
+
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
+
+#define MAX_MEMHEX_BYTES 8
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+#define SEQ_PUT_FIELD_RET(s, x) \
+do { \
+ if (!trace_seq_putmem(s, &(x), sizeof(x))) \
+ return TRACE_TYPE_PARTIAL_LINE; \
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x) \
+do { \
+ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
+ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
+ return TRACE_TYPE_PARTIAL_LINE; \
+} while (0)
+
+#endif
+
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248daf55..faa6ab7a1f5c 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include "trace.h"
+#include "trace_output.h"
static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..93cecda650b2 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,6 +25,7 @@ static int __read_mostly tracer_enabled;
static struct task_struct *wakeup_task;
static int wakeup_cpu;
static unsigned wakeup_prio = -1;
+static int wakeup_rt;
static raw_spinlock_t wakeup_lock =
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -152,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
goto out_unlock;
trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc);
/*
* usecs conversion is slow so we try to delay the conversion
@@ -213,6 +215,7 @@ static void wakeup_reset(struct trace_array *tr)
static void
probe_wakeup(struct rq *rq, struct task_struct *p, int success)
{
+ struct trace_array_cpu *data;
int cpu = smp_processor_id();
unsigned long flags;
long disabled;
@@ -224,7 +227,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);
- if (likely(!rt_task(p)) ||
+ if ((wakeup_rt && !rt_task(p)) ||
p->prio >= wakeup_prio ||
p->prio >= current->prio)
return;
@@ -252,9 +255,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
local_save_flags(flags);
- wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
- trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
- CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ data = wakeup_trace->data[wakeup_cpu];
+ data->preempt_timestamp = ftrace_now(cpu);
+ tracing_sched_wakeup_trace(wakeup_trace, data, p, current,
+ flags, pc);
+ trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2,
+ flags, pc);
out_locked:
__raw_spin_unlock(&wakeup_lock);
@@ -262,12 +268,6 @@ out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
@@ -306,13 +306,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
register_ftrace_function(&trace_ops);
- if (tracing_is_enabled()) {
+ if (tracing_is_enabled())
tracer_enabled = 1;
- save_tracer_enabled = 1;
- } else {
+ else
tracer_enabled = 0;
- save_tracer_enabled = 0;
- }
return;
fail_deprobe_wake_new:
@@ -324,20 +321,32 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
unregister_trace_sched_switch(probe_wakeup_sched_switch);
unregister_trace_sched_wakeup_new(probe_wakeup);
unregister_trace_sched_wakeup(probe_wakeup);
}
-static int wakeup_tracer_init(struct trace_array *tr)
+static int __wakeup_tracer_init(struct trace_array *tr)
{
+ tracing_max_latency = 0;
wakeup_trace = tr;
start_wakeup_tracer(tr);
return 0;
}
+static int wakeup_tracer_init(struct trace_array *tr)
+{
+ wakeup_rt = 0;
+ return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_rt_tracer_init(struct trace_array *tr)
+{
+ wakeup_rt = 1;
+ return __wakeup_tracer_init(tr);
+}
+
static void wakeup_tracer_reset(struct trace_array *tr)
{
stop_wakeup_tracer(tr);
@@ -349,28 +358,11 @@ static void wakeup_tracer_start(struct trace_array *tr)
{
wakeup_reset(tr);
tracer_enabled = 1;
- save_tracer_enabled = 1;
}
static void wakeup_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
-}
-
-static void wakeup_tracer_open(struct trace_iterator *iter)
-{
- /* stop the trace while dumping */
- tracer_enabled = 0;
-}
-
-static void wakeup_tracer_close(struct trace_iterator *iter)
-{
- /* forget about any processes we were recording */
- if (save_tracer_enabled) {
- wakeup_reset(iter->tr);
- tracer_enabled = 1;
- }
}
static struct tracer wakeup_tracer __read_mostly =
@@ -380,8 +372,19 @@ static struct tracer wakeup_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .open = wakeup_tracer_open,
- .close = wakeup_tracer_close,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+};
+
+static struct tracer wakeup_rt_tracer __read_mostly =
+{
+ .name = "wakeup_rt",
+ .init = wakeup_rt_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -396,6 +399,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;
+ ret = register_tracer(&wakeup_rt_tracer);
+ if (ret)
+ return ret;
+
return 0;
}
device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54a..5013812578b1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_FN:
case TRACE_CTX:
case TRACE_WAKE:
- case TRACE_CONT:
case TRACE_STACK:
case TRACE_PRINT:
case TRACE_SPECIAL:
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000000..eae9cef39291
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,319 @@
+/*
+ * Infrastructure for statistic tracing (histogram output).
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Based on the code from trace_branch.c which is
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* List of stat entries from a tracer */
+struct trace_stat_list {
+ struct list_head list;
+ void *stat;
+};
+
+/* A stat session is the stats output in one file */
+struct tracer_stat_session {
+ struct list_head session_list;
+ struct tracer_stat *ts;
+ struct list_head stat_list;
+ struct mutex stat_mutex;
+ struct dentry *file;
+};
+
+/* All of the sessions currently in use. Each stat file embeed one session */
+static LIST_HEAD(all_stat_sessions);
+static DEFINE_MUTEX(all_stat_sessions_mutex);
+
+/* The root directory for all stat files */
+static struct dentry *stat_dir;
+
+
+static void reset_stat_session(struct tracer_stat_session *session)
+{
+ struct trace_stat_list *node, *next;
+
+ list_for_each_entry_safe(node, next, &session->stat_list, list)
+ kfree(node);
+
+ INIT_LIST_HEAD(&session->stat_list);
+}
+
+static void destroy_session(struct tracer_stat_session *session)
+{
+ debugfs_remove(session->file);
+ reset_stat_session(session);
+ mutex_destroy(&session->stat_mutex);
+ kfree(session);
+}
+
+/*
+ * For tracers that don't provide a stat_cmp callback.
+ * This one will force an immediate insertion on tail of
+ * the list.
+ */
+static int dummy_cmp(void *p1, void *p2)
+{
+ return 1;
+}
+
+/*
+ * Initialize the stat list at each trace_stat file opening.
+ * All of these copies and sorting are required on all opening
+ * since the stats could have changed between two file sessions.
+ */
+static int stat_seq_init(struct tracer_stat_session *session)
+{
+ struct trace_stat_list *iter_entry, *new_entry;
+ struct tracer_stat *ts = session->ts;
+ void *prev_stat;
+ int ret = 0;
+ int i;
+
+ mutex_lock(&session->stat_mutex);
+ reset_stat_session(session);
+
+ if (!ts->stat_cmp)
+ ts->stat_cmp = dummy_cmp;
+
+ /*
+ * The first entry. Actually this is the second, but the first
+ * one (the stat_list head) is pointless.
+ */
+ new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+ if (!new_entry) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ INIT_LIST_HEAD(&new_entry->list);
+
+ list_add(&new_entry->list, &session->stat_list);
+
+ new_entry->stat = ts->stat_start();
+ prev_stat = new_entry->stat;
+
+ /*
+ * Iterate over the tracer stat entries and store them in a sorted
+ * list.
+ */
+ for (i = 1; ; i++) {
+ new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+ if (!new_entry) {
+ ret = -ENOMEM;
+ goto exit_free_list;
+ }
+
+ INIT_LIST_HEAD(&new_entry->list);
+ new_entry->stat = ts->stat_next(prev_stat, i);
+
+ /* End of insertion */
+ if (!new_entry->stat)
+ break;
+
+ list_for_each_entry(iter_entry, &session->stat_list, list) {
+
+ /* Insertion with a descendent sorting */
+ if (ts->stat_cmp(new_entry->stat,
+ iter_entry->stat) > 0) {
+
+ list_add_tail(&new_entry->list,
+ &iter_entry->list);
+ break;
+
+ /* The current smaller value */
+ } else if (list_is_last(&iter_entry->list,
+ &session->stat_list)) {
+ list_add(&new_entry->list, &iter_entry->list);
+ break;
+ }
+ }
+
+ prev_stat = new_entry->stat;
+ }
+exit:
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+
+exit_free_list:
+ reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+}
+
+
+static void *stat_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct tracer_stat_session *session = s->private;
+
+ /* Prevent from tracer switch or stat_list modification */
+ mutex_lock(&session->stat_mutex);
+
+ /* If we are in the beginning of the file, print the headers */
+ if (!*pos && session->ts->stat_headers)
+ session->ts->stat_headers(s);
+
+ return seq_list_start(&session->stat_list, *pos);
+}
+
+static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
+{
+ struct tracer_stat_session *session = s->private;
+
+ return seq_list_next(p, &session->stat_list, pos);
+}
+
+static void stat_seq_stop(struct seq_file *s, void *p)
+{
+ struct tracer_stat_session *session = s->private;
+ mutex_unlock(&session->stat_mutex);
+}
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+ struct tracer_stat_session *session = s->private;
+ struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+
+ return session->ts->stat_show(s, l->stat);
+}
+
+static const struct seq_operations trace_stat_seq_ops = {
+ .start = stat_seq_start,
+ .next = stat_seq_next,
+ .stop = stat_seq_stop,
+ .show = stat_seq_show
+};
+
+/* The session stat is refilled and resorted at each stat file opening */
+static int tracing_stat_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ struct tracer_stat_session *session = inode->i_private;
+
+ ret = seq_open(file, &trace_stat_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = session;
+ ret = stat_seq_init(session);
+ }
+
+ return ret;
+}
+
+/*
+ * Avoid consuming memory with our now useless list.
+ */
+static int tracing_stat_release(struct inode *i, struct file *f)
+{
+ struct tracer_stat_session *session = i->i_private;
+
+ mutex_lock(&session->stat_mutex);
+ reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+
+ return 0;
+}
+
+static const struct file_operations tracing_stat_fops = {
+ .open = tracing_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_stat_release
+};
+
+static int tracing_stat_init(void)
+{
+ struct dentry *d_tracing;
+
+ d_tracing = tracing_init_dentry();
+
+ stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ if (!stat_dir)
+ pr_warning("Could not create debugfs "
+ "'trace_stat' entry\n");
+ return 0;
+}
+
+static int init_stat_file(struct tracer_stat_session *session)
+{
+ if (!stat_dir && tracing_stat_init())
+ return -ENODEV;
+
+ session->file = debugfs_create_file(session->ts->name, 0644,
+ stat_dir,
+ session, &tracing_stat_fops);
+ if (!session->file)
+ return -ENOMEM;
+ return 0;
+}
+
+int register_stat_tracer(struct tracer_stat *trace)
+{
+ struct tracer_stat_session *session, *node, *tmp;
+ int ret;
+
+ if (!trace)
+ return -EINVAL;
+
+ if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+ return -EINVAL;
+
+ /* Already registered? */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ mutex_unlock(&all_stat_sessions_mutex);
+ return -EINVAL;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ /* Init the session */
+ session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
+
+ session->ts = trace;
+ INIT_LIST_HEAD(&session->session_list);
+ INIT_LIST_HEAD(&session->stat_list);
+ mutex_init(&session->stat_mutex);
+ session->file = NULL;
+
+ ret = init_stat_file(session);
+ if (ret) {
+ destroy_session(session);
+ return ret;
+ }
+
+ /* Register */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_add_tail(&session->session_list, &all_stat_sessions);
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ return 0;
+}
+
+void unregister_stat_tracer(struct tracer_stat *trace)
+{
+ struct tracer_stat_session *node, *tmp;
+
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ list_del(&node->session_list);
+ destroy_session(node);
+ break;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000000..202274cf7f3d
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
+#ifndef __TRACE_STAT_H
+#define __TRACE_STAT_H
+
+#include <linux/seq_file.h>
+
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+ /* The name of your stat file */
+ const char *name;
+ /* Iteration over statistic entries */
+ void *(*stat_start)(void);
+ void *(*stat_next)(void *prev, int idx);
+ /* Compare two entries for stats sorting */
+ int (*stat_cmp)(void *p1, void *p2);
+ /* Print a stat entry */
+ int (*stat_show)(struct seq_file *s, void *p);
+ /* Print the headers of your stat entries */
+ int (*stat_headers)(struct seq_file *s);
+};
+
+/*
+ * Destroy or create a stat file
+ */
+extern int register_stat_tracer(struct tracer_stat *trace);
+extern void unregister_stat_tracer(struct tracer_stat *trace);
+
+#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 000000000000..4664990fe9c5
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,281 @@
+/*
+ * Workqueue statistical tracer.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+
+#include <trace/workqueue.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* A cpu workqueue thread */
+struct cpu_workqueue_stats {
+ struct list_head list;
+/* Useful to know if we print the cpu headers */
+ bool first_entry;
+ int cpu;
+ pid_t pid;
+/* Can be inserted from interrupt or user context, need to be atomic */
+ atomic_t inserted;
+/*
+ * Don't need to be atomic, works are serialized in a single workqueue thread
+ * on a single CPU.
+ */
+ unsigned int executed;
+};
+
+/* List of workqueue threads on one cpu */
+struct workqueue_global_stats {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+/* Don't need a global lock because allocated before the workqueues, and
+ * never freed.
+ */
+static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
+#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
+
+/* Insertion of a work */
+static void
+probe_workqueue_insertion(struct task_struct *wq_thread,
+ struct work_struct *work)
+{
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
+ list) {
+ if (node->pid == wq_thread->pid) {
+ atomic_inc(&node->inserted);
+ goto found;
+ }
+ }
+ pr_debug("trace_workqueue: entry not found\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Execution of a work */
+static void
+probe_workqueue_execution(struct task_struct *wq_thread,
+ struct work_struct *work)
+{
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
+ list) {
+ if (node->pid == wq_thread->pid) {
+ node->executed++;
+ goto found;
+ }
+ }
+ pr_debug("trace_workqueue: entry not found\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Creation of a cpu workqueue thread */
+static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+{
+ struct cpu_workqueue_stats *cws;
+ unsigned long flags;
+
+ WARN_ON(cpu < 0 || cpu >= num_possible_cpus());
+
+ /* Workqueues are sometimes created in atomic context */
+ cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
+ if (!cws) {
+ pr_warning("trace_workqueue: not enough memory\n");
+ return;
+ }
+ tracing_record_cmdline(wq_thread);
+
+ INIT_LIST_HEAD(&cws->list);
+ cws->cpu = cpu;
+
+ cws->pid = wq_thread->pid;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ if (list_empty(&workqueue_cpu_stat(cpu)->list))
+ cws->first_entry = true;
+ list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Destruction of a cpu workqueue thread */
+static void probe_workqueue_destruction(struct task_struct *wq_thread)
+{
+ /* Workqueue only execute on one cpu */
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
+ list) {
+ if (node->pid == wq_thread->pid) {
+ list_del(&node->list);
+ kfree(node);
+ goto found;
+ }
+ }
+
+ pr_debug("trace_workqueue: don't find workqueue to destroy\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+}
+
+static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
+{
+ unsigned long flags;
+ struct cpu_workqueue_stats *ret = NULL;
+
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+ ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
+ struct cpu_workqueue_stats, list);
+
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ return ret;
+}
+
+static void *workqueue_stat_start(void)
+{
+ int cpu;
+ void *ret = NULL;
+
+ for_each_possible_cpu(cpu) {
+ ret = workqueue_stat_start_cpu(cpu);
+ if (ret)
+ return ret;
+ }
+ return NULL;
+}
+
+static void *workqueue_stat_next(void *prev, int idx)
+{
+ struct cpu_workqueue_stats *prev_cws = prev;
+ int cpu = prev_cws->cpu;
+ unsigned long flags;
+ void *ret = NULL;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+ for (++cpu ; cpu < num_possible_cpus(); cpu++) {
+ ret = workqueue_stat_start_cpu(cpu);
+ if (ret)
+ return ret;
+ }
+ return NULL;
+ }
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
+ list);
+}
+
+static int workqueue_stat_show(struct seq_file *s, void *p)
+{
+ struct cpu_workqueue_stats *cws = p;
+ unsigned long flags;
+ int cpu = cws->cpu;
+
+ seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
+ atomic_read(&cws->inserted),
+ cws->executed,
+ trace_find_cmdline(cws->pid));
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
+ seq_printf(s, "\n");
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ return 0;
+}
+
+static int workqueue_stat_headers(struct seq_file *s)
+{
+ seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
+ seq_printf(s, "# | | | |\n\n");
+ return 0;
+}
+
+struct tracer_stat workqueue_stats __read_mostly = {
+ .name = "workqueues",
+ .stat_start = workqueue_stat_start,
+ .stat_next = workqueue_stat_next,
+ .stat_show = workqueue_stat_show,
+ .stat_headers = workqueue_stat_headers
+};
+
+
+int __init stat_workqueue_init(void)
+{
+ if (register_stat_tracer(&workqueue_stats)) {
+ pr_warning("Unable to register workqueue stat tracer\n");
+ return 1;
+ }
+
+ return 0;
+}
+fs_initcall(stat_workqueue_init);
+
+/*
+ * Workqueues are created very early, just after pre-smp initcalls.
+ * So we must register our tracepoints at this stage.
+ */
+int __init trace_workqueue_early_init(void)
+{
+ int ret, cpu;
+
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ if (ret)
+ goto out;
+
+ ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ if (ret)
+ goto no_insertion;
+
+ ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ if (ret)
+ goto no_execution;
+
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ if (ret)
+ goto no_creation;
+
+ for_each_possible_cpu(cpu) {
+ spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+ INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+ }
+
+ return 0;
+
+no_creation:
+ unregister_trace_workqueue_creation(probe_workqueue_creation);
+no_execution:
+ unregister_trace_workqueue_execution(probe_workqueue_execution);
+no_insertion:
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+out:
+ pr_warning("trace_workqueue: unable to trace workqueues\n");
+
+ return 1;
+}
+early_initcall(trace_workqueue_early_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..e53ee18ef431 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
+#include <trace/workqueue.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
+DEFINE_TRACE(workqueue_insertion);
+
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, struct list_head *head)
{
+ trace_workqueue_insertion(cwq->thread, work);
+
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
@@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+DEFINE_TRACE(workqueue_execution);
+
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
spin_lock_irq(&cwq->lock);
@@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
*/
struct lockdep_map lockdep_map = work->lockdep_map;
#endif
-
+ trace_workqueue_execution(cwq->thread, work);
cwq->current_work = work;
list_del_init(cwq->worklist.next);
spin_unlock_irq(&cwq->lock);
@@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
return cwq;
}
+DEFINE_TRACE(workqueue_creation);
+
static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
cwq->thread = p;
+ trace_workqueue_creation(cwq->thread, cpu);
+
return 0;
}
@@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
}
EXPORT_SYMBOL_GPL(__create_workqueue_key);
+DEFINE_TRACE(workqueue_destruction);
+
static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
{
/*
@@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
* checks list_empty(), and a "normal" queue_work() can't use
* a dead CPU.
*/
+ trace_workqueue_destruction(cwq->thread);
kthread_stop(cwq->thread);
cwq->thread = NULL;
}
@@ -971,6 +985,8 @@ undo:
}
#ifdef CONFIG_SMP
+static struct workqueue_struct *work_on_cpu_wq __read_mostly;
+
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
@@ -991,8 +1007,8 @@ static void do_work_for_cpu(struct work_struct *w)
* @fn: the function to run
* @arg: the function arg
*
- * This will return -EINVAL in the cpu is not online, or the return value
- * of @fn otherwise.
+ * This will return the value @fn returns.
+ * It is up to the caller to ensure that the cpu doesn't go offline.
*/
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
@@ -1001,14 +1017,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
INIT_WORK(&wfc.work, do_work_for_cpu);
wfc.fn = fn;
wfc.arg = arg;
- get_online_cpus();
- if (unlikely(!cpu_online(cpu)))
- wfc.ret = -EINVAL;
- else {
- schedule_work_on(cpu, &wfc.work);
- flush_work(&wfc.work);
- }
- put_online_cpus();
+ queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
+ flush_work(&wfc.work);
return wfc.ret;
}
@@ -1025,4 +1035,8 @@ void __init init_workqueues(void)
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
+#ifdef CONFIG_SMP
+ work_on_cpu_wq = create_workqueue("work_on_cpu");
+ BUG_ON(!work_on_cpu_wq);
+#endif
}