diff options
Diffstat (limited to 'kernel')
56 files changed, 8805 insertions, 2811 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..40b0285e41ad 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -pg -mno-spe + +ifdef CONFIG_FTRACE +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg +endif + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -25,7 +37,6 @@ obj-$(CONFIG_FUTEX) += futex_compat.o endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o @@ -36,7 +47,6 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o -obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o @@ -48,7 +58,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o @@ -57,7 +66,6 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o -obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o ifeq ($(CONFIG_PREEMPT_RCU),y) @@ -69,6 +77,8 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index b7d3709cc452..e8692a5748c2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -572,16 +572,17 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) - return; + goto out; reply->pid = pid; reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (IS_ERR(tsk)) { - kfree(reply); - kfree_skb(skb); - } + if (!IS_ERR(tsk)) + return; + kfree_skb(skb); +out: + kfree(reply); } /* diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 9ef5e0aacc3c..f7921a2ecf16 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,10 +172,9 @@ static void insert_hash(struct audit_chunk *chunk) struct audit_chunk *audit_tree_lookup(const struct inode *inode) { struct list_head *list = chunk_hash(inode); - struct list_head *pos; + struct audit_chunk *p; - list_for_each_rcu(pos, list) { - struct audit_chunk *p = container_of(pos, struct audit_chunk, hash); + list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { get_inotify_watch(&p->watch); return p; diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c deleted file mode 100644 index d1a7605c5b8f..000000000000 --- a/kernel/backtracetest.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Simple stack backtrace regression test module - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/delay.h> - -static struct timer_list backtrace_timer; - -static void backtrace_test_timer(unsigned long data) -{ - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - dump_stack(); -} -static int backtrace_regression_test(void) -{ - printk("====[ backtrace testing ]===========\n"); - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - dump_stack(); - - init_timer(&backtrace_timer); - backtrace_timer.function = backtrace_test_timer; - mod_timer(&backtrace_timer, jiffies + 10); - - msleep(10); - printk("====[ end of backtrace testing ]====\n"); - return 0; -} - -static void exitf(void) -{ -} - -module_init(backtrace_regression_test); -module_exit(exitf); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fbc6fc8949b4..15ac0e1e4f4d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); + snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c722..787d3f4f7a63 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -192,7 +192,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -226,19 +225,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -255,8 +250,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: @@ -277,6 +270,7 @@ int __ref cpu_down(unsigned int cpu) cpu_maps_update_done(); return err; } +EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ diff --git a/kernel/exit.c b/kernel/exit.c index 1510f78a0ffa..fb8de6cbf2c7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -126,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk) __unhash_process(tsk); + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -133,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk) __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); @@ -894,12 +899,9 @@ static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); unsigned long free; - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); + free = stack_not_used(current); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index 933e60ebccae..b98b08aecee0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -54,6 +54,7 @@ #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> +#include <linux/magic.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; + unsigned long *stackend; + int err; prepare_to_copy(orig); @@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); @@ -660,136 +665,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int count_open_files(struct fdtable *fdt) -{ - int size = fdt->max_fds; - int i; - - /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) - break; - } - i = (i+1) * 8 * sizeof(long); - return i; -} - -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - -/* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. - */ -static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) -{ - struct files_struct *newf; - struct file **old_fds, **new_fds; - int open_files, size, i; - struct fdtable *old_fdt, *new_fdt; - - *errorp = -ENOMEM; - newf = alloc_files(); - if (!newf) - goto out; - - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); - open_files = count_open_files(old_fdt); - - /* - * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. - */ - if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; - spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) - goto out_release; - new_fdt = files_fdtable(newf); - /* - * Reacquire the oldf lock and a pointer to its fd table - * who knows it may have a new bigger fd table. We need - * the latest pointer. - */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; @@ -1039,7 +914,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc7..543d9ca9b4f4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1078,7 +1078,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e1..9eb21645765f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -111,29 +111,10 @@ static void create_kthread(struct kthread_create_info *create) complete(&create->done); } -/** - * kthread_create - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). - * - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM). - */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *__kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], + ...) { struct kthread_create_info create; @@ -158,7 +139,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(__kthread_create); /** * kthread_bind - bind a just-created kthread to a cpu. diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..65548eff029e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include <linux/irqflags.h> #include <linux/utsname.h> #include <linux/hash.h> +#include <linux/ftrace.h> #include <asm/sections.h> @@ -81,6 +82,8 @@ static int graph_lock(void) __raw_spin_unlock(&lockdep_lock); return 0; } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; return 1; } @@ -89,6 +92,7 @@ static inline int graph_unlock(void) if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); + current->lockdep_recursion--; __raw_spin_unlock(&lockdep_lock); return 0; } @@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1680,7 +1684,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2017,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2061,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2095,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* @@ -2246,7 +2264,7 @@ static inline int separate_irq_context(struct task_struct *curr, * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2686,7 +2704,7 @@ static void check_flags(unsigned long flags) * and also avoid lockdep recursion: */ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) + int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2708,7 +2726,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) { unsigned long flags; diff --git a/kernel/marker.c b/kernel/marker.c index b5a9fe1d50d5..1abfb923b761 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex); struct marker_entry { struct hlist_node hlist; char *format; - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ @@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * marker_probe_cb Callback that prepares the variable argument list for probes. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Since we do not use "typical" pointer based RCU in the 1 argument case, we * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, - const char *fmt, ...) +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) { va_list args; char ptype; @@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); - va_start(args, fmt); - func(mdata->single.probe_private, call_private, fmt, &args); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); va_end(args); } else { struct marker_probe_closure *multi; @@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) { - va_start(args, fmt); - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); va_end(args); } } @@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * marker_probe_cb Callback that does not prepare the variable argument list. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...) +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); - func(mdata->single.probe_private, call_private, fmt, &args); + func(mdata->single.probe_private, call_private, mdata->format, + &args); } else { struct marker_probe_closure *multi; int i; @@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); } preempt_enable(); } diff --git a/kernel/module.c b/kernel/module.c index f5e9491ef7ac..5f80478b746d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1337,7 +1337,19 @@ out_unreg: kobject_put(&mod->mkobj.kobj); return err; } -#endif + +static void mod_sysfs_fini(struct module *mod) +{ + kobject_put(&mod->mkobj.kobj); +} + +#else /* CONFIG_SYSFS */ + +static void mod_sysfs_fini(struct module *mod) +{ +} + +#endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { @@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod) module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); - kobject_put(&mod->mkobj.kobj); + mod_sysfs_fini(mod); } /* @@ -1780,7 +1792,7 @@ static struct module *load_module(void __user *umod, /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) || hdr->e_shentsize != sizeof(*sechdrs)) { diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 3aaa06c561de..1d94160eb532 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock) if (unlikely(!debug_locks)) return; - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); } diff --git a/kernel/mutex.c b/kernel/mutex.c index d046a345d365..9e98f985c5bf 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -373,8 +373,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * - * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * NOTE: this function follows the spin_trylock()/down_try() convention, + * so it is negated to the old down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The diff --git a/kernel/panic.c b/kernel/panic.c index 425567f45b9f..6729e3f4ebcb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...) vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +#ifdef CONFIG_DEBUG_BUGVERBOSE + dump_stack(); +#endif bust_spinlocks(0); /* @@ -321,13 +324,85 @@ EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR + +#ifndef GCC_HAS_SP +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. +#endif +static unsigned long __stack_check_testing; +/* + * Self test function for the stack-protector feature. + * This test requires that the local variable absolutely has + * a stack slot, hence the barrier()s. + */ +static noinline void __stack_chk_test_func(void) +{ + unsigned long foo; + barrier(); + /* + * we need to make sure we're not about to clobber the return address, + * while real exploits do this, it's unhealthy on a running system. + * Besides, if we would, the test is already failed anyway so + * time to pull the emergency brake on it. + */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+1)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#ifdef CONFIG_FRAME_POINTER + /* We also don't want to clobber the frame pointer */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+2)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#endif + barrier(); + if (current->stack_canary == *(((unsigned long *)&foo)+1)) + *(((unsigned long *)&foo)+1) = 0; + else + printk(KERN_ERR "No -fstack-protector canary found\n"); + barrier(); +} + +static int __stack_chk_test(void) +{ + printk(KERN_INFO "Testing -fstack-protector-all feature\n"); + __stack_check_testing = (unsigned long)&__stack_chk_test_func; + __stack_chk_test_func(); + if (__stack_check_testing) { + printk(KERN_ERR "-fstack-protector-all test failed\n"); + WARN_ON(1); + } + return 0; +} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted"); + if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { + long delta; + + delta = (unsigned long)__builtin_return_address(0) - + __stack_check_testing; + /* + * The test needs to happen inside the test function, so + * check if the return address is close to that function. + * The function is only 2 dozen bytes long, but keep a wide + * safety margin to avoid panic()s for normal users regardless + * of the quality of the compiler. + */ + if (delta >= 0 && delta <= 400) { + __stack_check_testing = 0; + return; + } + } + panic("stack-protector: Kernel stack is corrupted in: %p\n", + __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); + +late_initcall(__stack_chk_test); #endif diff --git a/kernel/pid.c b/kernel/pid.c index 20d59fa2d493..30bd5d4b2ac7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/rculist.h> #include <linux/bootmem.h> #include <linux/hash.h> #include <linux/pid_namespace.h> diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc652..d416be0efa8a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524e..d023b6b584e5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); Resume_console: resume_console(); Close: diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3b..c5b4aea992a6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -121,6 +121,8 @@ struct console_cmdline static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int selected_console = -1; static int preferred_console = -1; +int console_set_on_cmdline; +EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -890,6 +892,7 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); @@ -986,7 +989,7 @@ EXPORT_SYMBOL(acquire_console_sem); int try_acquire_console_sem(void) { - if (down_trylock(&console_sem)) + if (!down_try(&console_sem)) return -1; console_locked = 1; console_may_schedule = 0; @@ -1041,7 +1044,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; @@ -1083,7 +1088,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (!down_try(&console_sem)) return; } else acquire_console_sem(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6c19e94fd0a5..e337390fce01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } -int __ptrace_may_attach(struct task_struct *task) +int __ptrace_may_access(struct task_struct *task, unsigned int mode) { /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task); + return security_ptrace(current, task, mode); } -int ptrace_may_attach(struct task_struct *task) +bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); - err = __ptrace_may_attach(task); + err = __ptrace_may_access(task, mode); task_unlock(task); - return !err; + return (!err ? true : false); } int ptrace_attach(struct task_struct *task) @@ -195,7 +195,7 @@ repeat: /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = __ptrace_may_attach(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; @@ -494,7 +494,8 @@ int ptrace_traceme(void) */ task_lock(current); if (!(current->ptrace & PT_PTRACED)) { - ret = security_ptrace(current->parent, current); + ret = security_ptrace(current->parent, current, + PTRACE_MODE_ATTACH); /* * Set the ptrace bit in the process ptrace flags. */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306f..d8348792f9f5 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user) if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. + * + * Also do a memory barrier. This is needed to handle + * the case where writes from a preempt-disable section + * of code get reordered into schedule() by this CPU's + * write buffer. The memory barrier makes sure that + * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see + * by other CPUs to happen after any such write. + */ + + smp_mb(); /* See above block comment. */ rcu_qsctr_inc(cpu); rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. The memory barrier + * is needed for the same reason as is the above one. + */ + + smp_mb(); /* See above block comment. */ rcu_bh_qsctr_inc(cpu); + } raise_rcu_softirq(); } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16c..4a74b8d48d90 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -39,16 +39,16 @@ #include <linux/sched.h> #include <asm/atomic.h> #include <linux/bitops.h> -#include <linux/completion.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; +enum rcu_barrier { + RCU_BARRIER_STD, + RCU_BARRIER_BH, + RCU_BARRIER_SCHED, }; static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; @@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion; * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} +synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) @@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *notused) +static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); atomic_inc(&rcu_barrier_cpu_count); - call_rcu(head, rcu_barrier_callback); + switch ((enum rcu_barrier)type) { + case RCU_BARRIER_STD: + call_rcu(head, rcu_barrier_callback); + break; + case RCU_BARRIER_BH: + call_rcu_bh(head, rcu_barrier_callback); + break; + case RCU_BARRIER_SCHED: + call_rcu_sched(head, rcu_barrier_callback); + break; + } } -/** - * rcu_barrier - Wait until all the in-flight RCUs are complete. +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. */ -void rcu_barrier(void) +static void _rcu_barrier(enum rcu_barrier type) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -127,13 +128,39 @@ void rcu_barrier(void) * until all the callbacks are queued. */ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, (void *)type, 0, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(RCU_BARRIER_STD); +} EXPORT_SYMBOL_GPL(rcu_barrier); +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(RCU_BARRIER_BH); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(RCU_BARRIER_SCHED); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + void __init rcu_init(void) { __rcu_init(); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a515..b8e4cdae4e8d 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -46,11 +46,11 @@ #include <asm/atomic.h> #include <linux/bitops.h> #include <linux/module.h> +#include <linux/kthread.h> #include <linux/completion.h> #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> @@ -87,9 +87,14 @@ struct rcu_data { struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; + struct rcu_head *donelist; /* from waitlist & waitschedlist */ struct rcu_head **donetail; long rcu_flipctr[2]; + struct rcu_head *nextschedlist; + struct rcu_head **nextschedtail; + struct rcu_head *waitschedlist; + struct rcu_head **waitschedtail; + int rcu_sched_sleeping; #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace trace; #endif /* #ifdef CONFIG_RCU_TRACE */ @@ -131,11 +136,24 @@ enum rcu_try_flip_states { rcu_try_flip_waitmb_state, }; +/* + * States for rcu_ctrlblk.rcu_sched_sleep. + */ + +enum rcu_sched_sleep_states { + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ +}; + struct rcu_ctrlblk { spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. */ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ + spinlock_t schedlock; /* Protect rcu_sched sleep state. */ + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); @@ -143,8 +161,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), + .sched_sleep = rcu_sched_not_sleeping, + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), }; +static struct task_struct *rcu_sched_grace_period_task; #ifdef CONFIG_RCU_TRACE static char *rcu_try_flip_state_names[] = @@ -207,6 +229,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) */ #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); +#define RCU_SCHED_BATCH_TIME (HZ / 50) + /* * Return the number of RCU batches processed thus far. Useful * for debug and statistics. @@ -217,8 +241,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; @@ -413,32 +435,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +#ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); /** * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. * * If the CPU was idle with dynamic ticks active, this updates the - * dynticks_progress_counter to let the RCU handling know that the + * rcu_dyntick_sched.dynticks to let the RCU handling know that the * CPU is active. */ void rcu_irq_enter(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); if (per_cpu(rcu_update_flag, cpu)) per_cpu(rcu_update_flag, cpu)++; /* * Only update if we are coming from a stopped ticks mode - * (dynticks_progress_counter is even). + * (rcu_dyntick_sched.dynticks is even). */ if (!in_interrupt() && - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + (rdssp->dynticks & 0x1) == 0) { /* * The following might seem like we could have a race * with NMI/SMIs. But this really isn't a problem. @@ -461,12 +485,12 @@ void rcu_irq_enter(void) * RCU read-side critical sections on this CPU would * have already completed. */ - per_cpu(dynticks_progress_counter, cpu)++; + rdssp->dynticks++; /* * The following memory barrier ensures that any * rcu_read_lock() primitives in the irq handler * are seen by other CPUs to follow the above - * increment to dynticks_progress_counter. This is + * increment to rcu_dyntick_sched.dynticks. This is * required in order for other CPUs to correctly * determine when it is safe to advance the RCU * grace-period state machine. @@ -474,7 +498,7 @@ void rcu_irq_enter(void) smp_mb(); /* see above block comment. */ /* * Since we can't determine the dynamic tick mode from - * the dynticks_progress_counter after this routine, + * the rcu_dyntick_sched.dynticks after this routine, * we use a second flag to acknowledge that we came * from an idle state with ticks stopped. */ @@ -482,7 +506,7 @@ void rcu_irq_enter(void) /* * If we take an NMI/SMI now, they will also increment * the rcu_update_flag, and will not update the - * dynticks_progress_counter on exit. That is for + * rcu_dyntick_sched.dynticks on exit. That is for * this IRQ to do. */ } @@ -492,12 +516,13 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * dynticks_progress_counter to put let the RCU handling be + * rcu_dyntick_sched.dynticks to put let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); /* * rcu_update_flag is set if we interrupted the CPU @@ -505,7 +530,7 @@ void rcu_irq_exit(void) * Once this occurs, we keep track of interrupt nesting * because a NMI/SMI could also come in, and we still * only want the IRQ that started the increment of the - * dynticks_progress_counter to be the one that modifies + * rcu_dyntick_sched.dynticks to be the one that modifies * it on exit. */ if (per_cpu(rcu_update_flag, cpu)) { @@ -517,28 +542,29 @@ void rcu_irq_exit(void) /* * If an NMI/SMI happens now we are still - * protected by the dynticks_progress_counter being odd. + * protected by the rcu_dyntick_sched.dynticks being odd. */ /* * The following memory barrier ensures that any * rcu_read_unlock() primitives in the irq handler * are seen by other CPUs to preceed the following - * increment to dynticks_progress_counter. This + * increment to rcu_dyntick_sched.dynticks. This * is required in order for other CPUs to determine * when it is safe to advance the RCU grace-period * state machine. */ smp_mb(); /* see above block comment. */ - per_cpu(dynticks_progress_counter, cpu)++; - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + rdssp->dynticks++; + WARN_ON(rdssp->dynticks & 0x1); } } static void dyntick_save_progress_counter(int cpu) { - per_cpu(rcu_dyntick_snapshot, cpu) = - per_cpu(dynticks_progress_counter, cpu); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->dynticks_snap = rdssp->dynticks; } static inline int @@ -546,9 +572,10 @@ rcu_try_flip_waitack_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -569,7 +596,7 @@ rcu_try_flip_waitack_needed(int cpu) * that this CPU already acknowledged the counter. */ - if ((curr - snap) > 2 || (snap & 0x1) == 0) + if ((curr - snap) > 2 || (curr & 0x1) == 0) return 0; /* We need this CPU to explicitly acknowledge the counter flip. */ @@ -582,9 +609,10 @@ rcu_try_flip_waitmb_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -611,14 +639,86 @@ rcu_try_flip_waitmb_needed(int cpu) return 1; } +static void dyntick_save_progress_counter_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_dynticks_snap = rdssp->dynticks; +} + +static int rcu_qsctr_inc_needed_dyntick(int cpu) +{ + long curr; + long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + curr = rdssp->dynticks; + snap = rdssp->sched_dynticks_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. Therefore, this CPU has been in a quiescent + * state the entire time, and we don't need to wait for it. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, this CPU has already + * passed through a quiescent state. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + #else /* !CONFIG_NO_HZ */ -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +# define dyntick_save_progress_counter_sched(cpu) do { } while (0) +# define rcu_qsctr_inc_needed_dyntick(cpu) (1) #endif /* CONFIG_NO_HZ */ +static void save_qsctr_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs_snap = rdssp->sched_qs; +} + +static inline int rcu_qsctr_inc_needed(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + /* + * If there has been a quiescent state, no more need to wait + * on this CPU. + */ + + if (rdssp->sched_qs != rdssp->sched_qs_snap) { + smp_mb(); /* force ordering with cpu entering schedule(). */ + return 0; + } + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -821,6 +921,26 @@ void rcu_check_callbacks(int cpu, int user) unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + /* + * If this CPU took its interrupt from user mode or from the + * idle loop, and this is not a nested interrupt, then + * this CPU has to have exited all prior preept-disable + * sections of code. So increment the counter to note this. + * + * The memory barrier is needed to handle the case where + * writes from a preempt-disable section of code get reordered + * into schedule() by this CPU's write buffer. So the memory + * barrier makes sure that the rcu_qsctr_inc() is seen by other + * CPUs to happen after any such write. + */ + + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + smp_mb(); /* Guard against aggressive schedule(). */ + rcu_qsctr_inc(cpu); + } + rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); @@ -871,6 +991,8 @@ void rcu_offline_cpu(int cpu) struct rcu_head *list = NULL; unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head *schedlist = NULL; + struct rcu_head **schedtail = &schedlist; struct rcu_head **tail = &list; /* @@ -884,6 +1006,11 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], list, tail); rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, + schedlist, schedtail); + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, + schedlist, schedtail); + rdp->rcu_sched_sleeping = 0; spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; @@ -918,22 +1045,40 @@ void rcu_offline_cpu(int cpu) * fix. */ - local_irq_save(flags); + local_irq_save(flags); /* disable preempt till we know what lock. */ rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; + *rdp->nextschedtail = schedlist; + if (schedlist) + rdp->nextschedtail = schedtail; spin_unlock_irqrestore(&rdp->lock, flags); } void __devinit rcu_online_cpu(int cpu) { unsigned long flags; + struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpu_set(cpu, rcu_cpu_online_map); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * The rcu_sched grace-period processing might have bypassed + * this CPU, given that it was not in the rcu_cpu_online_map + * when the grace-period scan started. This means that the + * grace-period task might sleep. So make sure that if this + * should happen, the first callback posted to this CPU will + * wake up the grace-period task if need be. + */ + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + rdp->rcu_sched_sleeping = 1; + spin_unlock_irqrestore(&rdp->lock, flags); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -988,31 +1133,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock(&rdp->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + int wake_gp = 0; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + *rdp->nextschedtail = head; + rdp->nextschedtail = &head->next; + if (rdp->rcu_sched_sleeping) { + + /* Grace-period processing might be sleeping... */ + + rdp->rcu_sched_sleeping = 0; + wake_gp = 1; + } + spin_unlock_irqrestore(&rdp->lock, flags); + if (wake_gp) { + + /* Wake up grace-period processing, unless someone beat us. */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) + wake_gp = 0; + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + if (wake_gp) + wake_up_interruptible(&rcu_ctrlblk.sched_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + /* * Wait until all currently running preempt_disable() code segments * (including hardware-irq-disable segments) complete. Note that * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -void __synchronize_sched(void) +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * kthread function that manages call_rcu_sched grace periods. + */ +static int rcu_sched_grace_period(void *arg) { - cpumask_t oldmask; + int couldsleep; /* might sleep after current pass. */ + int couldsleepnext = 0; /* might sleep after next pass. */ int cpu; + unsigned long flags; + struct rcu_data *rdp; + int ret; - if (sched_getaffinity(0, &oldmask) < 0) - oldmask = cpu_possible_map; - for_each_online_cpu(cpu) { - sched_setaffinity(0, &cpumask_of_cpu(cpu)); - schedule(); - } - sched_setaffinity(0, &oldmask); + /* + * Each pass through the following loop handles one + * rcu_sched grace period cycle. + */ + do { + /* Save each CPU's current state. */ + + for_each_online_cpu(cpu) { + dyntick_save_progress_counter_sched(cpu); + save_qsctr_sched(cpu); + } + + /* + * Sleep for about an RCU grace-period's worth to + * allow better batching and to consume less CPU. + */ + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); + + /* + * If there was nothing to do last time, prepare to + * sleep at the end of the current grace period cycle. + */ + couldsleep = couldsleepnext; + couldsleepnext = 1; + if (couldsleep) { + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + } + + /* + * Wait on each CPU in turn to have either visited + * a quiescent state or been in dynticks-idle mode. + */ + for_each_online_cpu(cpu) { + while (rcu_qsctr_inc_needed(cpu) && + rcu_qsctr_inc_needed_dyntick(cpu)) { + /* resched_cpu(cpu); @@@ */ + schedule_timeout_interruptible(1); + } + } + + /* Advance callbacks for each CPU. */ + + for_each_online_cpu(cpu) { + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + + /* + * We are running on this CPU irq-disabled, so no + * CPU can go offline until we re-enable irqs. + * The current CPU might have already gone + * offline (between the for_each_offline_cpu and + * the spin_lock_irqsave), but in that case all its + * callback lists will be empty, so no harm done. + * + * Advance the callbacks! We share normal RCU's + * donelist, since callbacks are invoked the + * same way in either case. + */ + if (rdp->waitschedlist != NULL) { + *rdp->donetail = rdp->waitschedlist; + rdp->donetail = rdp->waitschedtail; + + /* + * Next rcu_check_callbacks() will + * do the required raise_softirq(). + */ + } + if (rdp->nextschedlist != NULL) { + rdp->waitschedlist = rdp->nextschedlist; + rdp->waitschedtail = rdp->nextschedtail; + couldsleep = 0; + couldsleepnext = 0; + } else { + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + } + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + + /* Mark sleep intention. */ + + rdp->rcu_sched_sleeping = couldsleep; + + spin_unlock_irqrestore(&rdp->lock, flags); + } + + /* If we saw callbacks on the last scan, go deal with them. */ + + if (!couldsleep) + continue; + + /* Attempt to block... */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { + + /* + * Someone posted a callback after we scanned. + * Go take care of it. + */ + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + couldsleepnext = 0; + continue; + } + + /* Block until the next person posts a callback. */ + + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + ret = 0; + __wait_event_interruptible(rcu_ctrlblk.sched_wq, + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, + ret); + + /* + * Signals would prevent us from sleeping, and we cannot + * do much with them in any case. So flush them. + */ + if (ret) + flush_signals(current); + couldsleepnext = 0; + + } while (!kthread_should_stop()); + + return (0); } -EXPORT_SYMBOL_GPL(__synchronize_sched); /* * Check to see if any future RCU-related work will need to be done @@ -1029,7 +1339,9 @@ int rcu_needs_cpu(int cpu) return (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL); + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL); } int rcu_pending(int cpu) @@ -1040,7 +1352,9 @@ int rcu_pending(int cpu) if (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL) + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL) return 1; /* The RCU core needs an acknowledgement from this CPU. */ @@ -1107,6 +1421,11 @@ void __init __rcu_init(void) rdp->donetail = &rdp->donelist; rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + rdp->rcu_sched_sleeping = 0; } register_cpu_notifier(&rcu_nb); @@ -1129,11 +1448,15 @@ void __init __rcu_init(void) } /* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + * Late-boot-time RCU initialization that must wait until after scheduler + * has been initialized. */ -void synchronize_kernel(void) +void __init rcu_init_sched(void) { - synchronize_rcu(); + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, + NULL, + "rcu_sched_grace_period"); + WARN_ON(IS_ERR(rcu_sched_grace_period_task)); } #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 49ac4947af24..5edf82c34bbc 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -38,7 +38,6 @@ #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/rcupreempt_trace.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c deleted file mode 100644 index 33acc424667e..000000000000 --- a/kernel/rcutorture.c +++ /dev/null @@ -1,999 +0,0 @@ -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> - * - * See also: Documentation/RCU/torture.txt - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/err.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <asm/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> -#include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/freezer.h> -#include <linux/cpu.h> -#include <linux/delay.h> -#include <linux/byteorder/swabb.h> -#include <linux/stat.h> -#include <linux/srcu.h> -#include <linux/slab.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); - -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval; /* Interval between stats, in seconds. */ - /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What RCU implementation to torture. */ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0444); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static char printk_buf[4096]; - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static int fullstop = 0; /* stop generating callbacks at test end. */ -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static struct list_head rcu_torture_removed; - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*readdelay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferredfree)(struct rcu_torture *p); - void (*sync)(void); - int (*stats)(char *page); - char *name; -}; -static struct rcu_torture_ops *cur_ops = NULL; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long longdelay = 200; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - cur_ops->deferredfree(rp); -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .stats = NULL, - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .stats = NULL, - .name = "rcu_sync" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -struct rcu_bh_torture_synchronize { - struct rcu_head head; - struct completion completion; -}; - -static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) -{ - struct rcu_bh_torture_synchronize *rcu; - - rcu = container_of(head, struct rcu_bh_torture_synchronize, head); - complete(&rcu->completion); -} - -static void rcu_bh_torture_synchronize(void) -{ - struct rcu_bh_torture_synchronize rcu; - - init_completion(&rcu.completion); - call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); - wait_for_completion(&rcu.completion); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .stats = NULL, - .name = "rcu_bh" -}; - -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .stats = NULL, - .name = "rcu_bh_sync" -}; - -/* - * Definitions for srcu torture testing. - */ - -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static int srcu_torture_stats(char *page) -{ - int cnt = 0; - int cpu; - int idx = srcu_ctl.completed & 0x1; - - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; -} - -static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .readdelay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static int sched_torture_completed(void) -{ - return 0; -} - -static void sched_torture_synchronize(void) -{ - synchronize_sched(); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, - .stats = NULL, - .name = "sched" -}; - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). - */ -static int -rcu_torture_writer(void *arg) -{ - int i; - long oldbatch = rcu_batches_completed(); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - cur_ops->deferredfree(old_rp); - } - rcu_torture_current_version++; - oldbatch = cur_ops->completed(); - } while (!kthread_should_stop() && !fullstop); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. - */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); - } while (!kthread_should_stop() && !fullstop); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - - do { - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->readdelay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = cur_ops->completed() - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_batch)[completed]; - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - } while (!kthread_should_stop() && !fullstop); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. - */ -static int -rcu_torture_printk(char *page) -{ - int cnt = 0; - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror)); - if (atomic_read(&n_rcu_torture_mberror) != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { - cnt += sprintf(&page[cnt], "!!! "); - atomic_inc(&n_rcu_torture_error); - } - cnt += sprintf(&page[cnt], "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", - atomic_read(&rcu_torture_wcount[i])); - } - cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int cnt; - - cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. - */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - cpumask_t tmp_mask; - int i; - - cpus_setall(tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); - - set_cpus_allowed_ptr(current, &tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); - } - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); - } - - if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); - - if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(char *tag) -{ - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); -} - -static void -rcu_torture_cleanup(void) -{ - int i; - - fullstop = 1; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - /* Wait for all RCU callbacks to fire. */ - rcu_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (cur_ops->cleanup) - cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms("End of test: FAILURE"); - else - rcu_torture_print_module_parms("End of test: SUCCESS"); -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &sched_ops, }; - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", - torture_type); - return (-EINVAL); - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms("Start of test"); - fullstop = 0; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. */ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - return 0; - -unwind: - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); diff --git a/kernel/relay.c b/kernel/relay.c index bc24dcdc570f..7de644cdec43 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c deleted file mode 100644 index 092e4c620af9..000000000000 --- a/kernel/rtmutex-tester.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/sysdev.h> -#include <linux/timer.h> -#include <linux/freezer.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; - int event; - struct sys_device sysdev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - - if (!lockwakeup && td->bkl == 4) { - unlock_kernel(); - td->bkl = 0; - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; - lock_kernel(); - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; - unlock_kernel(); - td->bkl = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKBKL: - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - case RTTEST_LOCKBKL: - return; - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, - size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, sysdev); - tid = td->sysdev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, sysdev); - tsk = threads[td->sysdev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->sysdev.id].owner); - - return curr - buf; -} - -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); - -static struct sysdev_class rttest_sysclass = { - .name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].sysdev.cls = &rttest_sysclass; - thread_data[id].sysdev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return sysdev_register(&thread_data[id].sysdev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = sysdev_class_register(&rttest_sysclass); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); - if (ret) - break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a91539..fc9ba904d5b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include <linux/bootmem.h> #include <linux/debugfs.h> #include <linux/ctype.h> +#include <linux/ftrace.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -136,7 +137,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) return 1; return 0; } @@ -398,43 +399,6 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; - /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. - */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; - - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; - - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; - } aggregate; -#endif #endif }; @@ -632,6 +596,8 @@ static inline void update_rq_clock(struct rq *rq) rq->clock = sched_clock_cpu(cpu_of(rq)); } +#include "sched_trace.h" + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -641,6 +607,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ @@ -865,12 +849,12 @@ static unsigned long long __cpu_clock(int cpu) * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ -unsigned long long cpu_clock(int cpu) +unsigned long long notrace cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; @@ -879,7 +863,7 @@ unsigned long long cpu_clock(int cpu) time = __sync_cpu_clock(time, cpu); per_cpu(prev_cpu_time, cpu) = time; } - local_irq_restore(flags); + raw_local_irq_restore(flags); return time; } @@ -1368,9 +1352,6 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) -/* - * delta *= weight / lw - */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1393,6 +1374,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } +static inline unsigned long +calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) +{ + return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); +} + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -1505,326 +1492,6 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) -{ - return &tg->cfs_rq[sd->first_cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) -{ - struct task_group *parent, *child; - - rcu_read_lock(); - parent = &root_task_group; -down: - (*down)(parent, sd); - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - (*up)(parent, sd); - - child = parent; - parent = parent->parent; - if (parent) - goto up; - rcu_read_unlock(); -} - -/* - * Calculate the aggregate runqueue weight. - */ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - unsigned long task_weight = 0; - int i; - - for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; - task_weight += tg->cfs_rq[i]->task_weight; - } - - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; -} - -/* - * Compute the weight of this group on the given cpus. - */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) - shares += tg->cfs_rq[i]->shares; - - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) - shares = tg->shares; - - aggregate(tg, sd)->shares = shares; -} - -/* - * Compute the load fraction assigned to this group, relies on the aggregate - * weight and this group's parent's load, i.e. top-down. - */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - int i; - - load = 0; - for_each_cpu_mask(i, sd->span) - load += cpu_rq(i)->load.weight; - - } else { - load = aggregate(tg->parent, sd)->load; - - /* - * shares is our weight in the parent's rq so - * shares/parent->rq_weight gives our fraction of the load - */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; - } - - aggregate(tg, sd)->load = load; -} - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) -{ - int boost = 0; - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[tcpu]) - return; - - rq_weight = tg->cfs_rq[tcpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * - */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; - - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; - - __set_se_shares(tg->se[tcpu], shares); -} - -/* - * Re-adjust the weights on the cpu the task came from and on the cpu the - * task went to. - */ -static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, - int scpu, int dcpu) -{ - unsigned long shares; - - shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - if (shares) - tg->cfs_rq[dcpu]->shares += shares; -} - -/* - * Because changing a group's shares changes the weight of the super-group - * we need to walk up the tree and change all shares until we hit the root. - */ -static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, - int scpu, int dcpu) -{ - while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); - tg = tg->parent; - } -} - -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) -{ - unsigned long shares = aggregate(tg, sd)->shares; - int i; - - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); - spin_unlock_irqrestore(&rq->lock, flags); - } - - aggregate_group_shares(tg, sd); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= aggregate(tg, sd)->shares; - if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; - } -} - -/* - * Calculate the accumulative weight and recursive load of each task group - * while walking down the tree. - */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) -{ - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); -} - -/* - * Rebalance the cpu shares while walking back up the tree. - */ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) -{ - aggregate_group_set_shares(tg, sd); -} - -static DEFINE_PER_CPU(spinlock_t, aggregate_lock); - -static void __init init_aggregate(void) -{ - int i; - - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(aggregate_lock, i)); -} - -static int get_aggregate(struct sched_domain *sd) -{ - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) - return 0; - - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); - return 1; -} - -static void put_aggregate(struct sched_domain *sd) -{ - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); -} - -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ - cfs_rq->shares = shares; -} - -#else - -static inline void init_aggregate(void) -{ -} - -static inline int get_aggregate(struct sched_domain *sd) -{ - return 0; -} - -static inline void put_aggregate(struct sched_domain *sd) -{ -} -#endif - #else /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1845,14 +1512,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct rq *rq) +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_load_add(&rq->load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_load_sub(&rq->load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; + inc_load(rq, p); } -static void dec_nr_running(struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1944,7 +1623,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + inc_nr_running(p, rq); } /* @@ -1956,7 +1635,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(rq); + dec_nr_running(p, rq); } /** @@ -2138,6 +1817,7 @@ void wait_task_inactive(struct task_struct *p) * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_kernel_sched_wait(p); running = task_running(rq, p); on_rq = p->se.on_rq; task_rq_unlock(rq, &flags); @@ -2481,6 +2161,7 @@ out_activate: success = 1; out_running: + trace_kernel_sched_wakeup(rq, p); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2609,8 +2290,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(rq); + inc_nr_running(p, rq); } + trace_kernel_sched_wakeup_new(rq, p); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2783,6 +2465,8 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + + trace_kernel_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -3015,6 +2699,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(cpu_is_offline(dest_cpu))) goto out; + trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -3600,12 +3285,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; - int unlock_aggregate; cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); - /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3721,9 +3403,8 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - - goto out; + return -1; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3738,13 +3419,8 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; -out: - if (unlock_aggregate) - put_aggregate(sd); - return ld_moved; + return -1; + return 0; } /* @@ -4362,26 +4038,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4393,7 +4087,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -4430,7 +4127,7 @@ static inline void schedule_debug(struct task_struct *prev) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4931,8 +4628,10 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_load(rq, p); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4942,6 +4641,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -5726,7 +5426,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { @@ -5748,12 +5448,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); @@ -7316,7 +7011,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7327,7 +7021,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7339,7 +7032,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7351,7 +7043,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7364,7 +7055,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7568,8 +7258,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; /* attribues of custom domains - in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -8034,7 +7724,6 @@ void __init sched_init(void) } #ifdef CONFIG_SMP - init_aggregate(); init_defrootdomain(); #endif @@ -8599,11 +8288,14 @@ void sched_move_task(struct task_struct *tsk) #endif #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) +static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; int on_rq; + spin_lock_irq(&rq->lock); + on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8613,17 +8305,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + spin_unlock_irq(&rq->lock); } static DEFINE_MUTEX(shares_mutex); @@ -8662,13 +8345,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); + for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); - } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 9c597e37f7de..ce05271219ab 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -59,22 +59,26 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } +static __read_mostly int sched_clock_running; + void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); - u64 now = 0; + unsigned long now_jiffies = jiffies; int cpu; for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->prev_jiffies = jiffies; - scd->prev_raw = now; - scd->tick_raw = now; + scd->prev_jiffies = now_jiffies; + scd->prev_raw = 0; + scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; } + + sched_clock_running = 1; } /* @@ -136,6 +140,9 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock; + if (unlikely(!sched_clock_running)) + return 0ull; + WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); @@ -174,6 +181,9 @@ void sched_clock_tick(void) struct sched_clock_data *scd = this_scd(); u64 now, now_gtod; + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5f06118fbc31..8bb713040ac9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); -#endif -#endif } static void print_cpu(struct seq_file *m, int cpu) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b8..08ae848b71d4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= w / rw - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - -/* - * delta *= rw / w - */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } - - return delta; -} - -/* * The idea is to set a period in which each task runs once. * * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch @@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); + u64 slice = __sched_period(cfs_rq->nr_running); + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); + } + + + return slice; } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s*rw/w = p + * vs = s/w = p/rw */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long nr_running = cfs_rq->nr_running; + unsigned long weight; + u64 vslice; if (!se->on_rq) nr_running++; - return __sched_period(nr_running); -} - -/* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; + vslice = __sched_period(nr_running); for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; + cfs_rq = cfs_rq_of(se); - if (se->load.weight < NICE_0_LOAD) - se_lw = &lw; + weight = cfs_rq->load.weight; + if (!se->on_rq) + weight += se->load.weight; - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, se_lw); + vslice *= NICE_0_LOAD; + do_div(vslice, weight); } - return delta; + return vslice; } /* @@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); + } curr->vruntime += delta_exec_weighted; } @@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -571,10 +523,6 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; - - /* - * convert the sleeper threshold into virtual time - */ - if (sched_feat(NORMALIZED_SLEEPER)) - thresh = calc_delta_fair(thresh, se); - - vruntime -= thresh; - } + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *curr = this_rq->curr; unsigned long tl = this_load; unsigned long tl_per_task; + int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE)) + if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + + /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly * woken task: */ - if (sync && curr->sched_class == &fair_sched_class) { + if (sync && balanced && curr->sched_class == &fair_sched_class) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && p->se.avg_overlap < sysctl_sched_migration_cost) return 1; @@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { + balanced) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making it harder for - * + nice tasks. + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. */ - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + if (unlikely(se->load.weight > NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); return gran; } @@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -static unsigned long -__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, - struct cfs_rq *cfs_rq) +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) { - struct rq_iterator cfs_rq_iterator; + struct sched_entity *curr; + struct task_struct *p; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - cfs_rq_iterator.arg = cfs_rq; + if (!cfs_rq->nr_running || !first_fair(cfs_rq)) + return MAX_PRIO; + + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + + p = task_of(curr); - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &cfs_rq_iterator); + return p->prio; } +#endif -#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { + struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; - - rcu_read_lock(); - list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; - - /* - * empty group - */ - if (!aggregate(tg, sd)->task_weight) - continue; - - rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; - - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; + struct rq_iterator cfs_rq_iterator; - imbalance = (busiest_weight - this_weight) / 2; + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; - if (imbalance < 0) - imbalance = busiest_weight; + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { +#ifdef CONFIG_FAIR_GROUP_SCHED + struct cfs_rq *this_cfs_rq; + long imbalance; + unsigned long maxload; - max_load = max(rem_load, imbalance); - moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, - tg->cfs_rq[busiest_cpu]); + this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - if (!moved_load) + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; + /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ + if (imbalance <= 0) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + /* Don't pull more than imbalance/2 */ + imbalance /= 2; + maxload = min(rem_load_move, imbalance); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); +#else +# define maxload rem_load_move +#endif + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + maxload, sd, idle, all_pinned, + this_best_prio, + &cfs_rq_iterator); - rem_load_move -= moved_load; - if (rem_load_move < 0) + if (rem_load_move <= 0) break; } - rcu_read_unlock(); return max_load_move - rem_load_move; } -#else -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return __load_balance_fair(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); -} -#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 060e87b0cb1c..3432d573205d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } - - dec_cpu_load(rq, p->se.load.weight); } /* diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5bae2e0c3ff2..a38878e0e49d 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -67,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v) preempt_enable(); #endif } + kfree(mask_str); return 0; } diff --git a/kernel/sched_trace.h b/kernel/sched_trace.h new file mode 100644 index 000000000000..29b48f34fd02 --- /dev/null +++ b/kernel/sched_trace.h @@ -0,0 +1,41 @@ +#include <linux/marker.h> + +static inline void trace_kernel_sched_wait(struct task_struct *p) +{ + trace_mark(kernel_sched_wait_task, "pid %d state %ld", + p->pid, p->state); +} + +static inline +void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline +void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline void trace_kernel_sched_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); +} + +static inline void +trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst) +{ + trace_mark(kernel_sched_migrate_task, + "pid %d state %ld dest_cpu %d", + p->pid, p->state, dst); +} diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e768cd..bbab232ee185 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -14,7 +14,7 @@ * Some notes on the implementation: * * The spinlock controls access to the other members of the semaphore. - * down_trylock() and up() can be called from interrupt context, so we + * down_try() and up() can be called from interrupt context, so we * have to disable interrupts when taking the lock. It turns out various * parts of the kernel expect to be able to use down() on a semaphore in * interrupt context when they know it will succeed, so we have to use @@ -31,6 +31,7 @@ #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/spinlock.h> +#include <linux/ftrace.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -53,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -114,19 +116,18 @@ int down_killable(struct semaphore *sem) EXPORT_SYMBOL(down_killable); /** - * down_trylock - try to acquire the semaphore, without waiting + * down_try - try to acquire the semaphore, without waiting * @sem: the semaphore to be acquired * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has - * been acquired successfully or 1 if it it cannot be acquired. + * Try to acquire the semaphore atomically. Returns true if the mutex has + * been acquired successfully or 0 if it it cannot be acquired. * - * NOTE: This return value is inverted from both spin_trylock and - * mutex_trylock! Be careful about this when converting code. + * NOTE: This replaces down_trylock() which returned the reverse. * * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int down_try(struct semaphore *sem) { unsigned long flags; int count; @@ -137,9 +138,9 @@ int down_trylock(struct semaphore *sem) sem->count = count; spin_unlock_irqrestore(&sem->lock, flags); - return (count < 0); + return (count >= 0); } -EXPORT_SYMBOL(down_trylock); +EXPORT_SYMBOL(down_try); /** * down_timeout - acquire the semaphore within a specified time diff --git a/kernel/signal.c b/kernel/signal.c index 72bb4f51f963..6c0958e52ea7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +} + void ignore_signals(struct task_struct *t) { int i; @@ -1240,17 +1274,22 @@ void sigqueue_free(struct sigqueue *q) BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* - * If the signal is still pending remove it from the - * pending queue. We must hold ->siglock while testing - * q->list to serialize with collect_signal(). + * We must hold ->siglock while testing q->list + * to serialize with collect_signal() or with + * __exit_signal()->flush_sigqueue(). */ spin_lock_irqsave(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; + /* + * If it is queued it will be freed when dequeued, + * like the "regular" sigqueue. + */ if (!list_empty(&q->list)) - list_del_init(&q->list); + q = NULL; spin_unlock_irqrestore(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - __sigqueue_free(q); + if (q) + __sigqueue_free(q); } int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index ae28c8245123..a1fb54c93cdd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) } EXPORT_SYMBOL(_spin_trylock_bh); -int in_lock_functions(unsigned long addr) +notrace int in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0101aeef7ed7..36e166def7c7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -13,202 +13,171 @@ #include <asm/atomic.h> #include <asm/uaccess.h> -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, - STOPMACHINE_PREPARE, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ + STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, + /* Everyone exited. */ + STOPMACHINE_COMPLETE, }; +static enum stopmachine_state state; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +struct stop_machine_data { + int (*fn)(void *); + void *data; + int fnret; +}; + +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; +} - set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) +{ + if (atomic_dec_and_test(&thread_ack)) { + set_state(state + 1); + if (state == STOPMACHINE_COMPLETE) + complete(&finished); + } +} - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ +static int stop_cpu(struct stop_machine_data *smdata) +{ + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); + do { + /* Chill out and ensure we re-read stopmachine_state. */ + cpu_relax(); + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - else - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); + } while (curstate < STOPMACHINE_EXIT); - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + local_irq_enable(); + do_exit(0); } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + return 0; } -static int stop_machine(void) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; - - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; - return 0; -} + /* Set up initial state. */ + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_DISABLE_IRQ); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); - local_irq_enable(); - preempt_enable_no_resched(); -} - -struct stop_machine_data { - int (*fn)(void *); - void *data; - struct completion done; -}; + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; -static int do_stop(void *_smdata) -{ - struct stop_machine_data *smdata = _smdata; - int ret; + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - restart_machine(); - } + threads[i] = kthread_create(stop_cpu, smdata, "kstop%u", i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); + /* Make it highest prio. */ + if (sched_setscheduler(threads[i], SCHED_FIFO, ¶m) != 0) + BUG(); } - __set_current_state(TASK_RUNNING); - return ret; -} - -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; - smdata.fn = fn; - smdata.data = data; - init_completion(&smdata.done); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - mutex_lock(&stopmachine_mutex); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS) - cpu = raw_smp_processor_id(); + kfree(threads); - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + return active.fnret; - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + kfree(threads); + return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine_run_notype(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine_run_notype); diff --git a/kernel/sys.c b/kernel/sys.c index 895d2d4c9493..14e97282eb6c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long uninitialized_var(error); + long error = 0; if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; @@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: - if (arg2 == PR_TIMING_STATISTICAL) - error = 0; - else + if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d7ffdc59816a..da2279d012d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -81,6 +82,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -131,8 +133,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -extern int sysctl_hz_timer; - #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; #endif @@ -454,6 +454,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, @@ -562,16 +572,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_NO_IDLE_HZ - { - .ctl_name = KERN_HZ_TIMER, - .procname = "hz_timer", - .data = &sysctl_hz_timer, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", @@ -1190,7 +1190,9 @@ static struct ctl_table fs_table[] = { .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, }, { .ctl_name = FS_DENTRY, diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c deleted file mode 100644 index 06b6395b45b2..000000000000 --- a/kernel/test_kprobes.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * test_kprobes.c - simple sanity test for *probes - * - * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - */ - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/random.h> - -#define div_factor 3 - -static u32 rand1, preh_val, posth_val, jph_val; -static int errors, handler_errors, num_tests; - -static noinline u32 kprobe_target(u32 value) -{ - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - - return (value / div_factor); -} - -static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - preh_val = (rand1 / div_factor); - return 0; -} - -static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - if (preh_val != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); - } - posth_val = preh_val + div_factor; -} - -static struct kprobe kp = { - .symbol_name = "kprobe_target", - .pre_handler = kp_pre_handler, - .post_handler = kp_post_handler -}; - -static int test_kprobe(void) -{ - int ret; - - ret = register_kprobe(&kp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_kprobe(&kp); - - if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); - handler_errors++; - } - - return 0; -} - -static u32 j_kprobe_target(u32 value) -{ - if (value != rand1) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -#ifdef CONFIG_KRETPROBES -static u32 krph_val; - -static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - krph_val = (rand1 / div_factor); - return 0; -} - -static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); - - if (ret != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); - } - if (krph_val == 0) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); - } - - krph_val = rand1; - return 0; -} - -static struct kretprobe rp = { - .handler = return_handler, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target" -}; - -static int test_kretprobe(void) -{ - int ret; - - ret = register_kretprobe(&rp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_kretprobe(&rp); - if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); - handler_errors++; - } - - return 0; -} -#endif /* CONFIG_KRETPROBES */ - -int init_test_probes(void) -{ - int ret; - - do { - rand1 = random32(); - } while (rand1 <= div_factor); - - printk(KERN_INFO "Kprobe smoke test started\n"); - num_tests++; - ret = test_kprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - -#ifdef CONFIG_KRETPROBES - num_tests++; - ret = test_kretprobe(); - if (ret < 0) - errors++; -#endif /* CONFIG_KRETPROBES */ - - if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); - else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); - else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); - - return 0; -} diff --git a/kernel/timer.c b/kernel/timer.c index ceacc6626572..ef3fa6906e8f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. @@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) return cmp_next_hrtimer_event(now, expires); } - -#ifdef CONFIG_NO_IDLE_HZ -unsigned long next_timer_interrupt(void) -{ - return get_next_timer_interrupt(jiffies); -} -#endif - #endif #ifndef CONFIG_VIRT_CPU_ACCOUNTING diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 000000000000..263e9e6bbd60 --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,135 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool + +config HAVE_DYNAMIC_FTRACE + bool + +config TRACER_MAX_TRACE + bool + +config TRACING + bool + select DEBUG_FS + select STACKTRACE + +config FTRACE + bool "Kernel Function Tracer" + depends on HAVE_FTRACE + select FRAME_POINTER + select TRACING + select CONTEXT_SWITCH_TRACER + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction to the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + depends on HAVE_FTRACE + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + depends on HAVE_FTRACE + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on X86 + select TRACING + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on HAVE_FTRACE + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" + depends on HAVE_FTRACE + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + +config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replaces them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. + + This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + has native performance as long as no tracing is active. + + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 000000000000..71d17de17288 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,24 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FTRACE +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o +endif + +obj-$(CONFIG_FTRACE) += libftrace.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o +obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 000000000000..f762f5a2d331 --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,1553 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com> + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include <linux/stop_machine.h> +#include <linux/clocksource.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/hardirq.h> +#include <linux/kthread.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/sysctl.h> +#include <linux/ctype.h> +#include <linux/hash.h> +#include <linux/list.h> + +#include "trace.h" + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int last_ftrace_enabled; + +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. + */ +static int ftrace_disabled __read_mostly; + +static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); + + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } + + spin_unlock(&ftrace_lock); + + return 0; +} + +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + int ret = 0; + + spin_lock(&ftrace_lock); + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } + + out: + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct task_struct *ftraced_task; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + +static int ftrace_filtered; + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + unsigned long index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static int ftraced_trigger; +static int ftraced_suspend; +static int ftraced_stop; + +static int ftrace_record_suspend; + +static struct dyn_ftrace *ftrace_free_records; + +static inline int +ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head_rcu(&node->node, &ftrace_hash[key]); +} + +static void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +{ + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + +static void +ftrace_record_ip(unsigned long ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + int cpu; + + if (!ftrace_enabled || ftrace_disabled) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + /* + * We simply need to protect against recursion. + * Use the the raw version of smp_processor_id and not + * __get_cpu_var which can call debug hooks that can + * cause a recursive crash here. + */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. If it is already converted, skip it. + */ + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +#define FTRACE_ADDR ((long)(ftrace_caller)) +#define MCOUNT_ADDR ((long)(mcount)) + +static void +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip, fl; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is set not to trace then + * do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! + */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + + new = ftrace_call_replace(ip, FTRACE_ADDR); + } else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } +} + +static void ftrace_replace_code(int enable) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int i; + + if (enable) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + __ftrace_replace_code(rec, old, new, enable); + } + } +} + +static void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +static int +ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + return 0; + } + return 1; +} + +static int __ftrace_update_code(void *ignore); + +static int __ftrace_modify_code(void *data) +{ + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); + ftrace_replace_code(1); + } else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; +} + +static void ftrace_run_update_code(int command) +{ + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); +} + +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +static ftrace_func_t saved_ftrace_func; + +static void ftrace_startup(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void ftrace_shutdown(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void ftrace_startup_sysctl(void) +{ + int command = FTRACE_ENABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static void ftrace_shutdown_sysctl(void) +{ + int command = FTRACE_DISABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + int save_ftrace_enabled; + cycle_t start, stop; + int i; + + /* Don't be recording funcs now */ + ftrace_record_suspend++; + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; + + start = ftrace_now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + if (ftrace_code_disable(p)) + ftrace_update_cnt++; + } + + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; + + ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; + + return 0; +} + +static int ftrace_update_code(void) +{ + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; + + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; +} + +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + + return 0; +} + +static void ftrace_filter_reset(int enable) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~type; + } + pg = pg->next; + } + preempt_enable(); +} + +static int +ftrace_regex_open(struct inode *inode, struct file *file, int enable) +{ + struct ftrace_iterator *iter; + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(enable); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = enable ? FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + +static ssize_t +ftrace_regex_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void +ftrace_match(unsigned char *buff, int len, int enable) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= flag; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_regex_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 1); +} + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); +} + +static int +ftrace_regex_release(struct inode *inode, struct file *file, int enable) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_regex_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_regex_lock); + return 0; +} + +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? "disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_regex_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_filter_release, +}; + +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + +/** + * ftrace_force_update - force an update to all recording ftrace functions + */ +int ftrace_force_update(void) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + + /* + * If ftraced_trigger is not set, then there is nothing + * to update. + */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; + + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + +static int __init ftrace_dynamic_init(void) +{ + struct task_struct *p; + unsigned long addr; + int ret; + + addr = (unsigned long)ftrace_record_ip; + + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) { + ret = (int)addr; + goto failed; + } + + ret = ftrace_dyn_table_alloc(); + if (ret) + goto failed; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) { + ret = -1; + goto failed; + } + + last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; + + return 0; + + failed: + ftrace_disabled = 1; + return ret; +} + +core_initcall(ftrace_dynamic_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. + */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -1; + + mutex_lock(&ftrace_sysctl_lock); + ret = __register_ftrace_function(ops); + ftrace_startup(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_sysctl_lock); + ret = __unregister_ftrace_function(ops); + ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; +} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 000000000000..a864dd292e38 --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,3131 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include <linux/utsrelease.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/pagemap.h> +#include <linux/hardirq.h> +#include <linux/linkage.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/gfp.h> +#include <linux/fs.h> +#include <linux/kprobes.h> +#include <linux/writeback.h> + +#include <linux/stacktrace.h> + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static unsigned long __read_mostly tracing_nr_buffers; +static cpumask_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu_mask(cpu, tracing_buffer_mask) + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = { + .name = "none", +}; + +static int trace_alloc_page(void); +static int trace_free_page(void); + +static int tracing_disabled = 1; + +static unsigned long tracing_pages_allocated; + +long +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +cycle_t ftrace_now(int cpu) +{ + return cpu_clock(cpu); +} + +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum + * trace while it continues examining live traces. + * + * The buffers for the max_tr are set up the same as the global_trace. + * When a snapshot is taken, the link list of the max_tr is swapped + * with the link list of the global_trace and the buffers are reset for + * the global_trace so the tracing can continue. + */ +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_data); + +/* tracer_enabled is used to toggle activation of a tracer */ +static int tracer_enabled = 1; + +/* + * trace_nr_entries is the number of entries that is allocated + * for a buffer. Note, the number of entries is always rounded + * to ENTRIES_PER_PAGE. + */ +static unsigned long trace_nr_entries = 65536UL; + +/* trace_types holds a link list of available tracers. */ +static struct tracer *trace_types __read_mostly; + +/* current_trace points to the tracer that is currently active */ +static struct tracer *current_trace __read_mostly; + +/* + * max_tracer_type_len is used to simplify the allocating of + * buffers to read userspace tracer names. We keep track of + * the longest tracer name registered. + */ +static int max_tracer_type_len; + +/* + * trace_types_lock is used to protect the trace_types list. + * This lock is also used to keep user access serialized. + * Accesses from userspace will grab this lock while userspace + * activities happen inside the kernel. + */ +static DEFINE_MUTEX(trace_types_lock); + +/* trace_wait is a waitqueue for tasks blocked on trace_poll */ +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +/* trace_flags holds iter_ctrl options */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Simply wakes up any task that is blocked on the trace_wait + * queue. These is used with trace_poll for tasks polling the trace. + */ +void trace_wake_up(void) +{ + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + wake_up(&trace_wait); +} + +#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) + +static int __init set_nr_entries(char *str) +{ + unsigned long nr_entries; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &nr_entries); + /* nr_entries can not be zero */ + if (ret < 0 || nr_entries == 0) + return 0; + trace_nr_entries = nr_entries; + return 1; +} +__setup("trace_entries=", set_nr_entries); + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +/* These must match the bit postions in trace_iterator_flags */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + "raw", + "hex", + "bin", + "block", + "stacktrace", + "sched-tree", + NULL +}; + +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + */ +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /debugfs/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = tsk->uid; + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(current); +} + +#define CHECK_COND(cond) \ + if (unlikely(cond)) { \ + tracing_disabled = 1; \ + WARN_ON(1); \ + return -1; \ + } + +/** + * check_pages - integrity check of trace buffers + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. + */ +int check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); + CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + CHECK_COND(page->lru.next->prev != &page->lru); + CHECK_COND(page->lru.prev->next != &page->lru); + } + + return 0; +} + +/** + * head_page - page address of the first page in per_cpu buffer. + * + * head_page returns the page address of the first page in + * a per_cpu buffer. This also preforms various consistency + * checks to make sure the buffer has not been corrupted. + */ +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +static int +trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +static int +trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +static int +trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +#define HEX_CHARS 17 +static const char hex2asc[] = "0123456789abcdef"; + +static int +trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + unsigned char byte; + int i, j; + + BUG_ON(len >= HEX_CHARS); + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + byte = data[i]; + + hex[j++] = hex2asc[byte & 0x0f]; + hex[j++] = hex2asc[byte >> 4]; + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +static void +trace_seq_reset(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +{ + int len; + int ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret) + return -EFAULT; + + s->readpos += len; + return cnt; +} + +static void +trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_reset(s); +} + +/* + * flip the trace buffers between two trace descriptors. + * This usually is the buffers between the global_trace and + * the max_tr to record a snapshot of a current trace. + * + * The ftrace_max_lock must be held. + */ +static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_head_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + /* clear out all the previous traces */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + flip_trace(max_tr.data[i], data); + tracing_reset(data); + } + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(i) + tracing_reset(max_tr.data[i]); + + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest) { + struct tracer *saved_tracer = current_trace; + struct trace_array_cpu *data; + struct trace_array *tr = &global_trace; + int saved_ctrl = tr->ctrl; + int i; + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. + */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + current_trace = type; + tr->ctrl = 0; + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + tr->ctrl = saved_ctrl; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + printk(KERN_CONT "PASSED\n"); + } +#endif + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + data->overrun = 0; + data->trace_head = data->trace_tail = head_page(data); + data->trace_head_idx = 0; + data->trace_tail_idx = 0; +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_SPINLOCK(trace_cmdline_lock); + +/* temporary disable recording */ +atomic_t trace_record_cmdline_disabled __read_mostly; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +void trace_stop_cmdline_recording(void); + +static void trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; + if (idx >= SAVED_CMDLINES) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return "<idle>"; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline struct list_head * +trace_next_list(struct trace_array_cpu *data, struct list_head *next) +{ + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = next->next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + + return next; +} + +static inline void * +trace_next_page(struct trace_array_cpu *data, void *addr) +{ + struct list_head *next; + struct page *page; + + page = virt_to_page(addr); + + next = trace_next_list(data, &page->lru); + page = list_entry(next, struct page, lru); + + return page_address(page); +} + +static inline struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + data->trace_idx++; + idx = data->trace_head_idx; + idx_next = idx + 1; + + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + + entry = data->trace_head + idx * TRACE_ENTRY_SIZE; + + if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { + data->trace_head = trace_next_page(data, data->trace_head); + idx_next = 0; + } + + if (data->trace_head == data->trace_tail && + idx_next == data->trace_tail_idx) { + /* overrun */ + data->overrun++; + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = + trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + } + + data->trace_head_idx = idx_next; + + return entry; +} + +static inline void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +{ + struct task_struct *tsk = current; + unsigned long pc; + + pc = preempt_count(); + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->t = ftrace_now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); +} + +void +trace_function(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + if (likely(!atomic_read(&data->disabled))) + trace_function(tr, data, ip, parent_ip, flags); +} + +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} +#endif + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); +} + +void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + __trace_stack(tr, data, irq_flags, 4); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_CTX; + entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = next->pid; + entry->ctx.next_prio = next->prio; + entry->ctx.next_state = next->state; + __trace_stack(tr, data, flags, 5); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_WAKE; + entry->ctx.prev_pid = curr->pid; + entry->ctx.prev_prio = curr->prio; + entry->ctx.prev_state = curr->state; + entry->ctx.next_pid = wakee->pid; + entry->ctx.next_prio = wakee->prio; + entry->ctx.next_state = wakee->state; + __trace_stack(tr, data, flags, 6); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#ifdef CONFIG_FTRACE +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} +#endif + +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) +{ + struct page *page; + struct trace_entry *array; + + if (iter->next_idx[cpu] >= tr->entries || + iter->next_idx[cpu] >= data->trace_idx || + (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx)) + return NULL; + + if (!iter->next_page[cpu]) { + /* Initialize the iterator for this cpu trace buffer */ + WARN_ON(!data->trace_tail); + page = virt_to_page(data->trace_tail); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_tail_idx; + } + + page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + + array = page_address(page); + + WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); + return &array[iter->next_page_idx[cpu]]; +} + +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_tracing_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ent->t < next->t)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + iter->idx++; + iter->next_idx[iter->cpu]++; + iter->next_page_idx[iter->cpu]++; + + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + iter->next_page_idx[iter->cpu] = 0; + iter->next_page[iter->cpu] = + trace_next_list(data, iter->next_page[iter->cpu]); + } +} + +static void trace_consume(struct trace_iterator *iter) +{ + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + + /* Check if we empty it, then reset the index */ + if (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx) + data->trace_idx = 0; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + + iter->ent = next; + iter->cpu = next_cpu; + + if (next) + trace_iterator_increment(iter); + + return next ? iter : NULL; +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *last_ent = iter->ent; + int i = (int)*pos; + void *ent; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) { + mutex_unlock(&trace_types_lock); + return NULL; + } + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + iter->prev_ent = NULL; + iter->prev_cpu = -1; + + for_each_tracing_cpu(i) { + iter->next_idx[i] = 0; + iter->next_page[i] = NULL; + } + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos - 1; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +#define KRETPROBE_MSG "[unknown/kretprobe'd]" + +#ifdef CONFIG_KRETPROBES +static inline int kretprobed(unsigned long addr) +{ + return addr == (unsigned long)kretprobe_trampoline; +} +#else +static inline int kretprobed(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_KRETPROBES */ + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + struct tracer *type = current_trace; + unsigned long total = 0; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + for_each_tracing_cpu(cpu) { + if (head_page(tr->data[cpu])) { + total += tr->data[cpu]->trace_idx; + if (tr->data[cpu]->trace_idx > tr->entries) + entries += tr->entries; + else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +static void +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) { + trace_seq_putc(s, 'H'); + } else { + if (hardirq) { + trace_seq_putc(s, 'h'); + } else { + if (softirq) + trace_seq_putc(s, 's'); + else + trace_seq_putc(s, '.'); + } + } + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_puts(s, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void +lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + trace_seq_printf(s, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + trace_seq_puts(s, "!: "); + else if (rel_usecs > 1) + trace_seq_puts(s, "+: "); + else + trace_seq_puts(s, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int +print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + char *comm; + int S, T; + int i; + unsigned state; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); + } else { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(s, entry->fn.ip, sym_flags); + trace_seq_puts(s, " ("); + if (kretprobed(entry->fn.parent_ip)) + trace_seq_puts(s, KRETPROBE_MSG); + else + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + trace_seq_puts(s, ")\n"); + break; + case TRACE_CTX: + case TRACE_WAKE: + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + + state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, entry->type == TRACE_CTX ? "==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T, comm); + break; + case TRACE_SPECIAL: + trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; + default: + trace_seq_printf(s, "Unknown type %d\n", entry->type); + } + return 1; +} + +static int print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + char *comm; + int ret; + int S, T; + int i; + + entry = iter->ent; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + if (!ret) + return 0; + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + ret = trace_seq_printf(s, " <-"); + if (!ret) + return 0; + if (kretprobed(entry->fn.parent_ip)) + ret = trace_seq_puts(s, KRETPROBE_MSG); + else + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_printf(s, "\n"); + if (!ret) + return 0; + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->type == TRACE_CTX ? "==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T); + if (!ret) + return 0; + break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; + } + return 1; +} + +static int print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + int ret; + int S, T; + + entry = iter->ent; + + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, entry->t); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = trace_seq_printf(s, "%x %x\n", + entry->fn.ip, entry->fn.parent_ip); + if (!ret) + return 0; + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; + ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + T); + if (!ret) + return 0; + break; + case TRACE_SPECIAL: + case TRACE_STACK: + ret = trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; + } + return 1; +} + +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static int print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + int S, T; + + entry = iter->ent; + + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, T); + break; + case TRACE_SPECIAL: + case TRACE_STACK: + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + break; + } + SEQ_PUT_FIELD_RET(s, newline); + + return 1; +} + +static int print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + + entry = iter->ent; + + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + break; + case TRACE_SPECIAL: + case TRACE_STACK: + SEQ_PUT_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_FIELD_RET(s, entry->special.arg3); + break; + } + return 1; +} + +static int trace_empty(struct trace_iterator *iter) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_tracing_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (head_page(data) && data->trace_idx && + (data->trace_tail != data->trace_head || + data->trace_tail_idx != data->trace_head_idx)) + return 0; + } + return 1; +} + +static int print_trace_line(struct trace_iterator *iter) +{ + if (iter->trace && iter->trace->print_line) + return iter->trace->print_line(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + return print_lat_fmt(iter, iter->idx, iter->cpu); + + return print_trace_fmt(iter); +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return 0; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } + } else { + print_trace_line(iter); + trace_print_seq(m, &iter->seq); + } + + return 0; +} + +static struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator * +__tracing_open(struct inode *inode, struct file *file, int *ret) +{ + struct trace_iterator *iter; + + if (tracing_disabled) { + *ret = -ENODEV; + return NULL; + } + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + *ret = -ENOMEM; + goto out; + } + + mutex_lock(&trace_types_lock); + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + iter->tr = inode->i_private; + iter->trace = current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + if (tracing_disabled) + return -ENODEV; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask = CPU_MASK_ALL; + +/* + * When tracing/tracing_cpu_mask is modified then this holds + * the new bitmask we are about to install: + */ +static cpumask_t tracing_cpumask_new; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (and one more byte for the newline): + */ +static char mask_str[NR_CPUS + 1]; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int len; + + mutex_lock(&tracing_cpumask_update_lock); + + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + if (count - len < 2) { + count = -EINVAL; + goto out_err; + } + len += sprintf(mask_str + len, "\n"); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + +out_err: + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err, cpu; + + mutex_lock(&tracing_cpumask_update_lock); + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_unlock; + + raw_local_irq_disable(); + __raw_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: + */ + if (cpu_isset(cpu, tracing_cpumask) && + !cpu_isset(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + if (!cpu_isset(cpu, tracing_cpumask) && + cpu_isset(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_enable(); + + tracing_cpumask = tracing_cpumask_new; + + mutex_unlock(&tracing_cpumask_update_lock); + + return count; + +err_unlock: + mutex_unlock(&tracing_cpumask_update_lock); + + return err; +} + +static struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, +}; + +static ssize_t +tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + /* calulate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); + len += 3; /* "no" and space */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + r += sprintf(buf + r, "%s ", trace_options[i]); + else + r += sprintf(buf + r, "no%s ", trace_options[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { + if (neg) + trace_flags &= ~(1 << i); + else + trace_flags |= (1 << i); + break; + } + } + /* + * If no option could be set, return an error: + */ + if (!trace_options[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, +}; + +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# mkdir /debug\n" + "# mount -t debugfs nodev /debug\n\n" + "# cat /debug/tracing/available_tracers\n" + "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "# cat /debug/tracing/current_tracer\n" + "none\n" + "# echo sched_switch > /debug/tracing/current_tracer\n" + "# cat /debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /debug/tracing/iter_ctrl\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo 1 > /debug/tracing/tracing_enabled\n" + "# cat /debug/tracing/trace > /tmp/trace.txt\n" + "echo 0 > /debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long *ptr = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + *ptr = val * 1000; + + return cnt; +} + +static atomic_t tracing_reader; + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_iterator *iter; + + if (tracing_disabled) + return -ENODEV; + + /* We only allow for reader of the pipe */ + if (atomic_inc_return(&tracing_reader) != 1) { + atomic_dec(&tracing_reader); + return -EBUSY; + } + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&trace_types_lock); + iter->tr = &global_trace; + iter->trace = current_trace; + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + + kfree(iter); + atomic_dec(&tracing_reader); + + return 0; +} + +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if (trace_flags & TRACE_ITER_BLOCK) { + /* + * Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + } else { + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait(filp, &trace_wait, poll_table); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + + return 0; + } +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + struct trace_array_cpu *data; + static cpumask_t mask; + unsigned long flags; +#ifdef CONFIG_FTRACE + int ftrace_save; +#endif + int cpu; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + sret = 0; + + trace_seq_reset(&iter->seq); + + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { + sret = -EAGAIN; + goto out; + } + + /* + * This is a make-shift waitqueue. The reason we don't use + * an actual wait queue is because: + * 1) we only ever have one waiter + * 2) the tracing, traces all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * Anyway, this is really very primitive wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + iter->tr->waiter = current; + + mutex_unlock(&trace_types_lock); + + /* sleep for 100 msecs, and try again. */ + schedule_timeout(HZ/10); + + mutex_lock(&trace_types_lock); + + iter->tr->waiter = NULL; + + if (signal_pending(current)) { + sret = -EINTR; + goto out; + } + + if (iter->trace != current_trace) + goto out; + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_enabled && iter->pos) + break; + + continue; + } + + /* stop when tracing is finished */ + if (trace_empty(iter)) + goto out; + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + /* reset all but tr, trace, and overruns */ + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter->pos = -1; + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + cpus_clear(mask); + local_irq_save(flags); +#ifdef CONFIG_FTRACE + ftrace_save = ftrace_enabled; + ftrace_enabled = 0; +#endif + smp_wmb(); + for_each_tracing_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!head_page(data) || !data->trace_idx) + continue; + + atomic_inc(&data->disabled); + cpu_set(cpu, mask); + } + + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; + __raw_spin_lock(&data->lock); + + if (data->overrun > iter->last_overrun[cpu]) + iter->overrun[cpu] += + data->overrun - iter->last_overrun[cpu]; + iter->last_overrun[cpu] = data->overrun; + } + + while (find_next_entry_inc(iter) != NULL) { + int ret; + int len = iter->seq.len; + + ret = print_trace_line(iter); + if (!ret) { + /* don't print partial lines */ + iter->seq.len = len; + break; + } + + trace_consume(iter); + + if (iter->seq.len >= cnt) + break; + } + + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; + __raw_spin_unlock(&data->lock); + } + + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; + atomic_dec(&data->disabled); + } +#ifdef CONFIG_FTRACE + ftrace_enabled = ftrace_save; +#endif + local_irq_restore(flags); + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.readpos >= iter->seq.len) + trace_seq_reset(&iter->seq); + if (sret == -EBUSY) + sret = 0; + +out: + mutex_unlock(&trace_types_lock); + + return sret; +} + +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%lu\n", tr->entries); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int i, ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + if (current_trace != &no_tracer) { + cnt = -EBUSY; + pr_info("ftrace: set current_tracer to none" + " before modifying buffer size\n"); + goto out; + } + + if (val > global_trace.entries) { + long pages_requested; + unsigned long freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + + while (global_trace.entries < val) { + if (trace_alloc_page()) { + cnt = -ENOMEM; + goto out; + } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; + } + + } else { + /* include the number of entries in val (inc of page entries) */ + while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) + trace_free_page(); + } + + /* check integrity */ + for_each_tracing_cpu(i) + check_pages(global_trace.data[i]); + + filp->f_pos += cnt; + + /* If check pages failed, return ENOMEM */ + if (tracing_disabled) + cnt = -ENOMEM; + out: + max_tr.entries = global_trace.entries; + mutex_unlock(&trace_types_lock); + + return cnt; +} + +static struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, +}; + +static struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, +}; + +static struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, +}; + +static struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .poll = tracing_poll_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, +}; + +static struct file_operations tracing_entries_fops = { + .open = tracing_open_generic, + .read = tracing_entries_read, + .write = tracing_entries_write, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_long(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", *p); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static struct file_operations tracing_read_long_fops = { + .open = tracing_open_generic, + .read = tracing_read_long, +}; +#endif + +static struct dentry *d_tracer; + +struct dentry *tracing_init_dentry(void) +{ + static int once; + + if (d_tracer) + return d_tracer; + + d_tracer = debugfs_create_dir("tracing", NULL); + + if (!d_tracer && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'tracing'\n"); + return NULL; + } + + return d_tracer; +} + +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + +static __init void tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + + entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + NULL, &tracing_iter_fops); + if (!entry) + pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + + entry = debugfs_create_file("latency_trace", 0444, d_tracer, + &global_trace, &tracing_lt_fops); + if (!entry) + pr_warning("Could not create debugfs 'latency_trace' entry\n"); + + entry = debugfs_create_file("trace", 0444, d_tracer, + &global_trace, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, + &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_max_latency' entry\n"); + + entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + entry = debugfs_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); + if (!entry) + pr_warning("Could not create debugfs 'README' entry\n"); + + entry = debugfs_create_file("trace_pipe", 0644, d_tracer, + NULL, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + + entry = debugfs_create_file("trace_entries", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + +#ifdef CONFIG_DYNAMIC_FTRACE + entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, + &tracing_read_long_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'dyn_ftrace_total_info' entry\n"); +#endif +#ifdef CONFIG_SYSPROF_TRACER + init_tracer_sysprof_debugfs(d_tracer); +#endif +} + +static int trace_alloc_page(void) +{ + struct trace_array_cpu *data; + struct page *page, *tmp; + LIST_HEAD(pages); + void *array; + unsigned pages_allocated = 0; + int i; + + /* first allocate a page for each CPU */ + for_each_tracing_cpu(i) { + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + + pages_allocated++; + page = virt_to_page(array); + list_add(&page->lru, &pages); + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + pages_allocated++; + page = virt_to_page(array); + list_add(&page->lru, &pages); +#endif + } + + /* Now that we successfully allocate a page per CPU, add them */ + for_each_tracing_cpu(i) { + data = global_trace.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del_init(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + ClearPageLRU(page); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del_init(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + SetPageLRU(page); +#endif + } + tracing_pages_allocated += pages_allocated; + global_trace.entries += ENTRIES_PER_PAGE; + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, lru) { + list_del_init(&page->lru); + __free_page(page); + } + return -ENOMEM; +} + +static int trace_free_page(void) +{ + struct trace_array_cpu *data; + struct page *page; + struct list_head *p; + int i; + int ret = 0; + + /* free one page from each buffer */ + for_each_tracing_cpu(i) { + data = global_trace.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + tracing_pages_allocated--; + tracing_pages_allocated--; + __free_page(page); + + tracing_reset(data); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + __free_page(page); + + tracing_reset(data); +#endif + } + global_trace.entries -= ENTRIES_PER_PAGE; + + return ret; +} + +__init static int tracer_alloc_buffers(void) +{ + struct trace_array_cpu *data; + void *array; + struct page *page; + int pages = 0; + int ret = -ENOMEM; + int i; + + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_nr_buffers = num_possible_cpus(); + tracing_buffer_mask = cpu_possible_map; + + /* Allocate the first page for all buffers */ + for_each_tracing_cpu(i) { + data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_buffers; + } + + /* set the array to the list */ + INIT_LIST_HEAD(&data->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &data->trace_pages); + /* use the LRU flag to differentiate the two buffers */ + ClearPageLRU(page); + + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_buffers; + } + + INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &max_tr.data[i]->trace_pages); + SetPageLRU(page); +#endif + } + + /* + * Since we allocate by orders of pages, we may be able to + * round up a bit. + */ + global_trace.entries = ENTRIES_PER_PAGE; + pages++; + + while (global_trace.entries < trace_nr_entries) { + if (trace_alloc_page()) + break; + pages++; + } + max_tr.entries = global_trace.entries; + + pr_info("tracer: %d pages allocated for %ld", + pages, trace_nr_entries); + pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info(" actual entries %ld\n", global_trace.entries); + + tracer_init_debugfs(); + + trace_init_cmdlines(); + + register_tracer(&no_tracer); + current_trace = &no_tracer; + + /* All seems OK, enable tracing */ + global_trace.ctrl = tracer_enabled; + tracing_disabled = 0; + + return 0; + + free_buffers: + for (i-- ; i >= 0; i--) { + struct page *page, *tmp; + struct trace_array_cpu *data = global_trace.data[i]; + + if (data) { + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del_init(&page->lru); + __free_page(page); + } + } + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + if (data) { + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del_init(&page->lru); + __free_page(page); + } + } +#endif + } + return ret; +} +fs_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 000000000000..8cb215b239d5 --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,333 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include <linux/fs.h> +#include <asm/atomic.h> +#include <linux/sched.h> +#include <linux/clocksource.h> +#include <linux/mmiotrace.h> + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, + + __TRACE_LAST_TYPE +}; + +/* + * Function trace entry - function address and parent function addres: + */ +struct ftrace_entry { + unsigned long ip; + unsigned long parent_ip; +}; + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + */ +struct ctx_switch_entry { + unsigned int prev_pid; + unsigned char prev_prio; + unsigned char prev_state; + unsigned int next_pid; + unsigned char next_prio; + unsigned char next_state; +}; + +/* + * Special (free-form) trace entry: + */ +struct special_entry { + unsigned long arg1; + unsigned long arg2; + unsigned long arg3; +}; + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + struct special_entry special; + struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + struct list_head trace_pages; + atomic_t disabled; + raw_spinlock_t lock; + struct lock_class_key lock_key; + + /* these fields get copied into max-trace: */ + unsigned trace_head_idx; + unsigned trace_tail_idx; + void *trace_head; /* producer */ + void *trace_tail; /* consumer */ + unsigned long trace_idx; + unsigned long overrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. + * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct task_struct *waiter; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + void (*ctrl_update)(struct trace_array *tr); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + int (*print_line)(struct trace_iterator *iter); + struct tracer *next; + int print_max; +}; + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + long last_overrun[NR_CPUS]; + long overrun[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + + struct trace_entry *prev_ent; + int prev_cpu; + + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + struct list_head *next_page[NR_CPUS]; + unsigned next_page_idx[NR_CPUS]; + long idx; +}; + +void tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags); +void trace_special(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3); +void trace_function(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); + +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +extern cycle_t ftrace_now(int cpu); + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + void *__rq, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#endif + +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + +#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_FTRACE +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_IRQSOFF_TRACER +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_PREEMPT_TRACER +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SCHED_TRACER +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SYSPROF_TRACER +extern int trace_selftest_startup_sysprof(struct tracer *trace, + struct trace_array *tr); +#endif +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern long ns2usecs(cycle_t nsec); + +extern unsigned long trace_flags; + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_SCHED_TREE = 0x200, +}; + +#endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 000000000000..7ee7dcd76b7d --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,78 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace.h" + +static void function_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void start_function_trace(struct trace_array *tr) +{ + function_reset(tr); + tracing_start_cmdline_record(); + tracing_start_function_trace(); +} + +static void stop_function_trace(struct trace_array *tr) +{ + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); +} + +static void function_trace_init(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); +} + +static void function_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_function_trace(tr); +} + +static void function_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); + else + stop_function_trace(tr); +} + +static struct tracer function_trace __read_mostly = +{ + .name = "ftrace", + .init = function_trace_init, + .reset = function_trace_reset, + .ctrl_update = function_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +static __init int init_function_trace(void) +{ + return register_tracer(&function_trace); +} + +device_initcall(init_function_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 000000000000..421d6fe3650e --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,486 @@ +/* + * trace irqs off criticall timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include <linux/kallsyms.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +static DEFINE_PER_CPU(int, tracing_cpu); + +static DEFINE_SPINLOCK(max_trace_lock); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + spin_lock_irqsave(&max_trace_lock, flags); + + /* check if we are still the max latency */ + if (!report_latency(delta)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out_unlock; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + max_sequence++; + +out_unlock: + spin_unlock_irqrestore(&max_trace_lock, flags); + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + tracing_reset(data); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + tracing_reset(data); + + local_save_flags(flags); + + trace_function(tr, data, ip, parent_ip, flags); + + per_cpu(tracing_cpu, cpu) = 1; + + atomic_dec(&data->disabled); +} + +static inline void +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; + else + return; + + if (!tracer_enabled) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!head_page(data)) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + local_save_flags(flags); + trace_function(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +/* start and stop critical timings used to for stoppage (in idle) */ +void start_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void stop_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_IRQSOFF_TRACER +#ifdef CONFIG_PROVE_LOCKING +void time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); +} + +void time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void trace_hardirqs_on(void) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + register_ftrace_function(&trace_ops); + tracer_enabled = 1; +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); +} + +static void __irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif +}; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif + +__init static int init_irqsoff_tracer(void) +{ + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 000000000000..b13dc19dcbb4 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,295 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi> + */ + +#define DEBUG 1 + +#include <linux/kernel.h> +#include <linux/mmiotrace.h> +#include <linux/pci.h> + +#include "trace.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; + +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + overrun_detected = false; + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) { + mmio_reset_data(tr); + enable_mmiotrace(); + } +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) { + mmio_reset_data(tr); + enable_mmiotrace(); + } else { + disable_mmiotrace(); + } +} + +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! */ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + int cpu; + unsigned long cnt = 0; + for_each_online_cpu(cpu) { + cnt += iter->overrun[cpu]; + iter->overrun[cpu] = 0; + } + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 000000000000..c16935d3bc5c --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,286 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/marker.h> +#include <linux/ftrace.h> + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; +static atomic_t sched_ref; + +static void +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) +{ + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_switch_trace(tr, data, prev, next, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + +static void +wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct + task_struct *curr) +{ + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + tracing_record_cmdline(curr); + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + if (likely(!tracer_enabled)) + return; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); + tracing_record_cmdline(curr); + + wakeup_func(probe_data, __rq, task, curr); +} + +static void sched_switch_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + +static void start_sched_trace(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracer_enabled = 1; + tracing_start_cmdline_record(); +} + +static void stop_sched_trace(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + tracer_enabled = 0; +} + +static void sched_switch_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_sched_trace(tr); +} + +static void sched_switch_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_sched_trace(tr); +} + +static void sched_switch_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_sched_trace(tr); + else + stop_sched_trace(tr); +} + +static struct tracer sched_switch_trace __read_mostly = +{ + .name = "sched_switch", + .init = sched_switch_trace_init, + .reset = sched_switch_trace_reset, + .ctrl_update = sched_switch_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, +#endif +}; + +__init static int init_sched_switch_trace(void) +{ + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } + return register_tracer(&sched_switch_trace); +} +device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 000000000000..bf7e91caef57 --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,447 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/marker.h> + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static DEFINE_SPINLOCK(wakeup_lock); + +static void __wakeup_reset(struct trace_array *tr); + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int resched; + int cpu; + + if (likely(!wakeup_task)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + if (unlikely(!wakeup_task)) + goto unlock; + + /* + * The task can't disappear because it needs to + * wake up first, and we have the wakeup_lock. + */ + if (task_cpu(wakeup_task) != cpu) + goto unlock; + + trace_function(tr, data, ip, parent_ip, flags); + + unlock: + spin_unlock_irqrestore(&wakeup_lock, flags); + + out: + atomic_dec(&data->disabled); + + /* + * To prevent recursion from the scheduler, if the + * resched flag was set before we entered, then + * don't reschedule. + */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. + */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waiting for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + +out_unlock: + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + assert_spin_locked(&wakeup_lock); + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_lock, flags); + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + spin_lock(&wakeup_lock); + + /* check for races. */ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(tr, tr->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags); + +out_locked: + spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + if (likely(!tracer_enabled)) + return; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); + tracing_record_cmdline(curr); + + wakeup_check_start(tr, task, curr); +} + +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. + */ + smp_wmb(); + + tracer_enabled = 1; + register_ftrace_function(&trace_ops); + + return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 000000000000..0911b7e073bf --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,563 @@ +/* Include in trace.c */ + +#include <linux/kthread.h> +#include <linux/delay.h> + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_SPECIAL: + return 1; + } + return 0; +} + +static int +trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +{ + struct trace_entry *entries; + struct page *page; + int idx = 0; + int i; + + BUG_ON(list_empty(&data->trace_pages)); + page = list_entry(data->trace_pages.next, struct page, lru); + entries = page_address(page); + + check_pages(data); + if (head_page(data) != entries) + goto failed; + + /* + * The starting trace buffer always has valid elements, + * if any element exists. + */ + entries = head_page(data); + + for (i = 0; i < tr->entries; i++) { + + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", + entries[idx].type); + goto failed; + } + + idx++; + if (idx >= ENTRIES_PER_PAGE) { + page = virt_to_page(entries); + if (page->lru.next == &data->trace_pages) { + if (i != tr->entries - 1) { + printk(KERN_CONT ".. entries buffer mismatch"); + goto failed; + } + } else { + page = list_entry(page->lru.next, struct page, lru); + entries = page_address(page); + } + idx = 0; + } + } + + page = virt_to_page(entries); + if (page->lru.next != &data->trace_pages) { + printk(KERN_CONT ".. too many entries"); + goto failed; + } + + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + raw_local_irq_save(flags); + __raw_spin_lock(&ftrace_max_lock); + for_each_possible_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + + cnt += tr->data[cpu]->trace_idx; + + ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + if (ret) + break; + } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +#ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define __STR(x) #x +#define STR(x) __STR(x) + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* + * Some archs *cough*PowerPC*cough* add charachters to the + * start of the function names. We simply put a '*' to + * accomodate them. + */ + func_name = "*" STR(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed count=%ld ..", count); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ +/* + * Simple verification test of ftrace function tracer. + * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* start the tracing */ + ftrace_enabled = 1; + tracer_enabled = 1; + + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tr->ctrl = 1; + trace->ctrl_update(tr); + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + out: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_SCHED_TRACER +static int trace_wakeup_test_thread(void *data) +{ + /* Make this a RT thread, doesn't need to be too high */ + struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* Make it know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run. + */ + msleep(100); + } + + return 0; +} + +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes this is slightly racy. It is possible that for some + * strange reason that the RT thread we created, did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that already is awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case, is that we disable tracing. + * Honestly, if this race does happen something is horrible + * wrong with the system. + */ + + wake_up_process(p); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_SYSPROF_TRACER +int +trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + return ret; +} +#endif /* CONFIG_SYSPROF_TRACER */ diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 000000000000..54dd77cce5bf --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,7 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c new file mode 100644 index 000000000000..2301e1e7c606 --- /dev/null +++ b/kernel/trace/trace_sysprof.c @@ -0,0 +1,363 @@ +/* + * trace stack traces + * + * Copyright (C) 2004-2008, Soeren Sandmann + * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + */ +#include <linux/kallsyms.h> +#include <linux/debugfs.h> +#include <linux/hrtimer.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/irq.h> +#include <linux/fs.h> + +#include <asm/stacktrace.h> + +#include "trace.h" + +static struct trace_array *sysprof_trace; +static int __read_mostly tracer_enabled; + +/* + * 1 msec sample interval by default: + */ +static unsigned long sample_period = 1000000; +static const unsigned int sample_max_depth = 512; + +static DEFINE_MUTEX(sample_timer_lock); +/* + * Per CPU hrtimers that do the profiling: + */ +static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +struct backtrace_info { + struct trace_array_cpu *data; + struct trace_array *tr; + int pos; +}; + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct backtrace_info *info = data; + + if (info->pos < sample_max_depth && reliable) { + __trace_special(info->tr, info->data, 1, addr, 0); + + info->pos++; + } +} + +const static struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static int +trace_kernel(struct pt_regs *regs, struct trace_array *tr, + struct trace_array_cpu *data) +{ + struct backtrace_info info; + unsigned long bp; + char *stack; + + info.tr = tr; + info.data = data; + info.pos = 1; + + __trace_special(info.tr, info.data, 1, regs->ip, 0); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = regs->bp; +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); + + return info.pos; +} + +static void timer_notify(struct pt_regs *regs, int cpu) +{ + struct trace_array_cpu *data; + struct stack_frame frame; + struct trace_array *tr; + const void __user *fp; + int is_user; + int i; + + if (!regs) + return; + + tr = sysprof_trace; + data = tr->data[cpu]; + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + __trace_special(tr, data, 0, 0, current->pid); + + if (!is_user) + i = trace_kernel(regs, tr, data); + else + i = 0; + + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm && i < sample_max_depth) { + regs = (struct pt_regs *)current->thread.sp0 - 1; + + fp = (void __user *)regs->bp; + + __trace_special(tr, data, 2, regs->ip, 0); + + while (i < sample_max_depth) { + frame.next_fp = 0; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + + __trace_special(tr, data, 2, frame.return_address, + (unsigned long)fp); + fp = frame.next_fp; + + i++; + } + + } + + /* + * Special trace entry if we overflow the max depth: + */ + if (i == sample_max_depth) + __trace_special(tr, data, -1, -1, -1); + + __trace_special(tr, data, 3, current->pid, i); +} + +static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) +{ + /* trace here */ + timer_notify(get_irq_regs(), smp_processor_id()); + + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); + + return HRTIMER_RESTART; +} + +static void start_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = stack_trace_timer_fn; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); +} + +static void start_stack_timers(void) +{ + cpumask_t saved_mask = current->cpus_allowed; + int cpu; + + for_each_online_cpu(cpu) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + start_stack_timer(cpu); + } + set_cpus_allowed_ptr(current, &saved_mask); +} + +static void stop_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_cancel(hrtimer); +} + +static void stop_stack_timers(void) +{ + int cpu; + + for_each_online_cpu(cpu) + stop_stack_timer(cpu); +} + +static void stack_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void start_stack_trace(struct trace_array *tr) +{ + mutex_lock(&sample_timer_lock); + stack_reset(tr); + start_stack_timers(); + tracer_enabled = 1; + mutex_unlock(&sample_timer_lock); +} + +static void stop_stack_trace(struct trace_array *tr) +{ + mutex_lock(&sample_timer_lock); + stop_stack_timers(); + tracer_enabled = 0; + mutex_unlock(&sample_timer_lock); +} + +static void stack_trace_init(struct trace_array *tr) +{ + sysprof_trace = tr; + + if (tr->ctrl) + start_stack_trace(tr); +} + +static void stack_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_stack_trace(tr); +} + +static void stack_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_stack_trace(tr); + else + stop_stack_trace(tr); +} + +static struct tracer stack_trace __read_mostly = +{ + .name = "sysprof", + .init = stack_trace_init, + .reset = stack_trace_reset, + .ctrl_update = stack_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sysprof, +#endif +}; + +__init static int init_stack_trace(void) +{ + return register_tracer(&stack_trace); +} +device_initcall(init_stack_trace); + +#define MAX_LONG_DIGITS 22 + +static ssize_t +sysprof_sample_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + int r; + + r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +sysprof_sample_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + unsigned long val; + + if (cnt > MAX_LONG_DIGITS-1) + cnt = MAX_LONG_DIGITS-1; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + /* + * Enforce a minimum sample period of 100 usecs: + */ + if (val < 100) + val = 100; + + mutex_lock(&sample_timer_lock); + stop_stack_timers(); + sample_period = val * 1000; + start_stack_timers(); + mutex_unlock(&sample_timer_lock); + + return cnt; +} + +static struct file_operations sysprof_sample_fops = { + .read = sysprof_sample_read, + .write = sysprof_sample_write, +}; + +void init_tracer_sysprof_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + + entry = debugfs_create_file("sysprof_sample_period", 0644, + d_tracer, NULL, &sysprof_sample_fops); + if (entry) + return; + pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); +} |