56 files changed, 8805 insertions, 2811 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..40b0285e41ad 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,18 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -pg -mno-spe
+
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+endif
+
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -25,7 +37,6 @@ obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -36,7 +47,6 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
-obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -48,7 +58,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -57,7 +66,6 @@ obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 ifeq ($(CONFIG_PREEMPT_RCU),y)
@@ -69,6 +77,8 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index b7d3709cc452..e8692a5748c2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -572,16 +572,17 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 
 	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
-		return;
+		goto out;
 
 	reply->pid = pid;
 	reply->skb = skb;
 
 	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
-	if (IS_ERR(tsk)) {
-		kfree(reply);
-		kfree_skb(skb);
-	}
+	if (!IS_ERR(tsk))
+		return;
+	kfree_skb(skb);
+out:
+	kfree(reply);
 }
 
 /*
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 9ef5e0aacc3c..f7921a2ecf16 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -172,10 +172,9 @@ static void insert_hash(struct audit_chunk *chunk)
 struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 {
 	struct list_head *list = chunk_hash(inode);
-	struct list_head *pos;
+	struct audit_chunk *p;
 
-	list_for_each_rcu(pos, list) {
-		struct audit_chunk *p = container_of(pos, struct audit_chunk, hash);
+	list_for_each_entry_rcu(p, list, hash) {
 		if (p->watch.inode == inode) {
 			get_inotify_watch(&p->watch);
 			return p;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
deleted file mode 100644
index d1a7605c5b8f..000000000000
--- a/kernel/backtracetest.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Simple stack backtrace regression test module
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
-
-static struct timer_list backtrace_timer;
-
-static void backtrace_test_timer(unsigned long data)
-{
-	printk("Testing a backtrace from irq context.\n");
-	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
-}
-static int backtrace_regression_test(void)
-{
-	printk("====[ backtrace testing ]===========\n");
-	printk("Testing a backtrace from process context.\n");
-	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
-
-	init_timer(&backtrace_timer);
-	backtrace_timer.function = backtrace_test_timer;
-	mod_timer(&backtrace_timer, jiffies + 10);
-
-	msleep(10);
-	printk("====[ end of backtrace testing ]====\n");
-	return 0;
-}
-
-static void exitf(void)
-{
-}
-
-module_init(backtrace_regression_test);
-module_exit(exitf);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fbc6fc8949b4..15ac0e1e4f4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	cg = tsk->cgroups;
 	parent = task_cgroup(tsk, subsys->subsys_id);
 
-	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
 
 	/* Pin the hierarchy */
 	atomic_inc(&parent->root->sb->s_active);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..787d3f4f7a63 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -192,7 +192,6 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -226,19 +225,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed_ptr(current, &tmp);
 
-	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
+	err = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
 
-	if (IS_ERR(p) || cpu_online(cpu)) {
+	if (err || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
-		if (IS_ERR(p)) {
-			err = PTR_ERR(p);
-			goto out_allowed;
-		}
-		goto out_thread;
+		goto out_allowed;
 	}
 
 	/* Wait for it to sleep (leaving idle task). */
@@ -255,8 +250,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	check_for_tasks(cpu);
 
-out_thread:
-	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
@@ -277,6 +270,7 @@ int __ref cpu_down(unsigned int cpu)
 	cpu_maps_update_done();
 	return err;
 }
+EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/exit.c b/kernel/exit.c
index 1510f78a0ffa..fb8de6cbf2c7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -126,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk)
 
 	__unhash_process(tsk);
 
+	/*
+	 * Do this under ->siglock, we can race with another thread
+	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+	 */
+	flush_sigqueue(&tsk->pending);
+
 	tsk->signal = NULL;
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
@@ -133,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk)
 
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		flush_sigqueue(&sig->shared_pending);
 		taskstats_tgid_free(sig);
@@ -894,12 +899,9 @@ static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
 	static int lowest_to_date = THREAD_SIZE;
-	unsigned long *n = end_of_stack(current);
 	unsigned long free;
 
-	while (*n == 0)
-		n++;
-	free = (unsigned long)n - (unsigned long)end_of_stack(current);
+	free = stack_not_used(current);
 
 	if (free >= lowest_to_date)
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 933e60ebccae..b98b08aecee0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -54,6 +54,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/magic.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	unsigned long *stackend;
+
 	int err;
 
 	prepare_to_copy(orig);
@@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	stackend = end_of_stack(tsk);
+	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	tsk->stack_canary = get_random_int();
@@ -660,136 +665,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static int count_open_files(struct fdtable *fdt)
-{
-	int size = fdt->max_fds;
-	int i;
-
-	/* Find the last open fd */
-	for (i = size/(8*sizeof(long)); i > 0; ) {
-		if (fdt->open_fds->fds_bits[--i])
-			break;
-	}
-	i = (i+1) * 8 * sizeof(long);
-	return i;
-}
-
-static struct files_struct *alloc_files(void)
-{
-	struct files_struct *newf;
-	struct fdtable *fdt;
-
-	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
-	if (!newf)
-		goto out;
-
-	atomic_set(&newf->count, 1);
-
-	spin_lock_init(&newf->file_lock);
-	newf->next_fd = 0;
-	fdt = &newf->fdtab;
-	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
-	fdt->open_fds = (fd_set *)&newf->open_fds_init;
-	fdt->fd = &newf->fd_array[0];
-	INIT_RCU_HEAD(&fdt->rcu);
-	fdt->next = NULL;
-	rcu_assign_pointer(newf->fdt, fdt);
-out:
-	return newf;
-}
-
-/*
- * Allocate a new files structure and copy contents from the
- * passed in files structure.
- * errorp will be valid only when the returned files_struct is NULL.
- */
-static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
-{
-	struct files_struct *newf;
-	struct file **old_fds, **new_fds;
-	int open_files, size, i;
-	struct fdtable *old_fdt, *new_fdt;
-
-	*errorp = -ENOMEM;
-	newf = alloc_files();
-	if (!newf)
-		goto out;
-
-	spin_lock(&oldf->file_lock);
-	old_fdt = files_fdtable(oldf);
-	new_fdt = files_fdtable(newf);
-	open_files = count_open_files(old_fdt);
-
-	/*
-	 * Check whether we need to allocate a larger fd array and fd set.
-	 * Note: we're not a clone task, so the open count won't change.
-	 */
-	if (open_files > new_fdt->max_fds) {
-		new_fdt->max_fds = 0;
-		spin_unlock(&oldf->file_lock);
-		spin_lock(&newf->file_lock);
-		*errorp = expand_files(newf, open_files-1);
-		spin_unlock(&newf->file_lock);
-		if (*errorp < 0)
-			goto out_release;
-		new_fdt = files_fdtable(newf);
-		/*
-		 * Reacquire the oldf lock and a pointer to its fd table
-		 * who knows it may have a new bigger fd table. We need
-		 * the latest pointer.
-		 */
-		spin_lock(&oldf->file_lock);
-		old_fdt = files_fdtable(oldf);
-	}
-
-	old_fds = old_fdt->fd;
-	new_fds = new_fdt->fd;
-
-	memcpy(new_fdt->open_fds->fds_bits,
-		old_fdt->open_fds->fds_bits, open_files/8);
-	memcpy(new_fdt->close_on_exec->fds_bits,
-		old_fdt->close_on_exec->fds_bits, open_files/8);
-
-	for (i = open_files; i != 0; i--) {
-		struct file *f = *old_fds++;
-		if (f) {
-			get_file(f);
-		} else {
-			/*
-			 * The fd may be claimed in the fd bitmap but not yet
-			 * instantiated in the files array if a sibling thread
-			 * is partway through open().  So make sure that this
-			 * fd is available to the new process.
-			 */
-			FD_CLR(open_files - i, new_fdt->open_fds);
-		}
-		rcu_assign_pointer(*new_fds++, f);
-	}
-	spin_unlock(&oldf->file_lock);
-
-	/* compute the remainder to be cleared */
-	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
-	/* This is long word aligned thus could use a optimized version */
-	memset(new_fds, 0, size);
-
-	if (new_fdt->max_fds > open_files) {
-		int left = (new_fdt->max_fds-open_files)/8;
-		int start = open_files / (8 * sizeof(unsigned long));
-
-		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
-		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
-	}
-
-	return newf;
-
-out_release:
-	kmem_cache_free(files_cachep, newf);
-out:
-	return NULL;
-}
-
 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct files_struct *oldf, *newf;
@@ -1039,7 +914,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..543d9ca9b4f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1078,7 +1078,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..9eb21645765f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -111,29 +111,10 @@ static void create_kthread(struct kthread_create_info *create)
 	complete(&create->done);
 }
 
-/**
- * kthread_create - create a kthread.
- * @threadfn: the function to run until signal_pending(current).
- * @data: data ptr for @threadfn.
- * @namefmt: printf-style name for the thread.
- *
- * Description: This helper function creates and names a kernel
- * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run(), kthread_create_on_cpu().
- *
- * When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
- * return when 'kthread_should_stop()' is true (which means
- * kthread_stop() has been called).  The return value should be zero
- * or a negative error number; it will be passed to kthread_stop().
- *
- * Returns a task_struct or ERR_PTR(-ENOMEM).
- */
-struct task_struct *kthread_create(int (*threadfn)(void *data),
-				   void *data,
-				   const char namefmt[],
-				   ...)
+struct task_struct *__kthread_create(int (*threadfn)(void *data),
+				     void *data,
+				     const char namefmt[],
+				     ...)
 {
 	struct kthread_create_info create;
 
@@ -158,7 +139,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	}
 	return create.result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(__kthread_create);
 
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..65548eff029e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -81,6 +82,8 @@ static int graph_lock(void)
 		__raw_spin_unlock(&lockdep_lock);
 		return 0;
 	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
 	return 1;
 }
 
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
 	if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
+	current->lockdep_recursion--;
 	__raw_spin_unlock(&lockdep_lock);
 	return 0;
 }
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1684,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2017,13 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
 
+	time_hardirqs_on(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2055,16 +2061,23 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
+	time_hardirqs_off(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2082,7 +2095,12 @@ void trace_hardirqs_off(void)
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
@@ -2246,7 +2264,7 @@ static inline int separate_irq_context(struct task_struct *curr,
  * Mark a lock with a usage bit, and validate the state transition:
  */
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
-		     enum lock_usage_bit new_bit)
+			     enum lock_usage_bit new_bit)
 {
 	unsigned int new_mask = 1 << new_bit, ret = 1;
 
@@ -2686,7 +2704,7 @@ static void check_flags(unsigned long flags)
  * and also avoid lockdep recursion:
  */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-		  int trylock, int read, int check, unsigned long ip)
+			  int trylock, int read, int check, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2708,7 +2726,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void lock_release(struct lockdep_map *lock, int nested,
+			  unsigned long ip)
 {
 	unsigned long flags;
 
diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..1abfb923b761 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
 struct marker_entry {
 	struct hlist_node hlist;
 	char *format;
-	void (*call)(const struct marker *mdata,	/* Probe wrapper */
-		void *call_private, const char *fmt, ...);
+			/* Probe wrapper */
+	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
 	int refcount;	/* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * marker_probe_cb Callback that prepares the variable argument list for probes.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Since we do not use "typical" pointer based RCU in the 1 argument case, we
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private,
-	const char *fmt, ...)
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;
 	char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		va_start(args, fmt);
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		va_start(args, call_private);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 		va_end(args);
 	} else {
 		struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++) {
-			va_start(args, fmt);
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			va_start(args, call_private);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 			va_end(args);
 		}
 	}
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  * marker_probe_cb Callback that does not prepare the variable argument list.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, const char *fmt, ...)
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 	} else {
 		struct marker_probe_closure *multi;
 		int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++)
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 	}
 	preempt_enable();
 }
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491ef7ac..5f80478b746d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1337,7 +1337,19 @@ out_unreg:
 	kobject_put(&mod->mkobj.kobj);
 	return err;
 }
-#endif
+
+static void mod_sysfs_fini(struct module *mod)
+{
+	kobject_put(&mod->mkobj.kobj);
+}
+
+#else /* CONFIG_SYSFS */
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+#endif /* CONFIG_SYSFS */
 
 static void mod_kobject_remove(struct module *mod)
 {
@@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod)
 	module_param_sysfs_remove(mod);
 	kobject_put(mod->mkobj.drivers_dir);
 	kobject_put(mod->holders_dir);
-	kobject_put(&mod->mkobj.kobj);
+	mod_sysfs_fini(mod);
 }
 
 /*
@@ -1780,7 +1792,7 @@ static struct module *load_module(void __user *umod,
 
 	/* Sanity checks against insmoding binaries or wrong arch,
            weird elf version */
-	if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
 	    || hdr->e_type != ET_REL
 	    || !elf_check_arch(hdr)
 	    || hdr->e_shentsize != sizeof(*sechdrs)) {
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3aaa06c561de..1d94160eb532 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock)
 	if (unlikely(!debug_locks))
 		return;
 
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d046a345d365..9e98f985c5bf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -373,8 +373,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
  * Try to acquire the mutex atomically. Returns 1 if the mutex
  * has been acquired successfully, and 0 on contention.
  *
- * NOTE: this function follows the spin_trylock() convention, so
- * it is negated to the down_trylock() return values! Be careful
+ * NOTE: this function follows the spin_trylock()/down_try() convention,
+ * so it is negated to the old down_trylock() return values! Be careful
  * about this when converting semaphore users to mutexes.
  *
  * This function must not be used in interrupt context. The
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..6729e3f4ebcb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...)
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+	dump_stack();
+#endif
 	bust_spinlocks(0);
 
 	/*
@@ -321,13 +324,85 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
+
+#ifndef GCC_HAS_SP
+#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
+#endif
+static unsigned long __stack_check_testing;
+/*
+ * Self test function for the stack-protector feature.
+ * This test requires that the local variable absolutely has
+ * a stack slot, hence the barrier()s.
+ */
+static noinline void __stack_chk_test_func(void)
+{
+	unsigned long foo;
+	barrier();
+	/*
+	 * we need to make sure we're not about to clobber the return address,
+	 * while real exploits do this, it's unhealthy on a running system.
+	 * Besides, if we would, the test is already failed anyway so
+	 * time to pull the emergency brake on it.
+	 */
+	if ((unsigned long)__builtin_return_address(0) ==
+					*(((unsigned long *)&foo)+1)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#ifdef CONFIG_FRAME_POINTER
+	/* We also don't want to clobber the frame pointer */
+	if ((unsigned long)__builtin_return_address(0) ==
+					*(((unsigned long *)&foo)+2)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#endif
+	barrier();
+	if (current->stack_canary == *(((unsigned long *)&foo)+1))
+		*(((unsigned long *)&foo)+1) = 0;
+	else
+		printk(KERN_ERR "No -fstack-protector canary found\n");
+	barrier();
+}
+
+static int __stack_chk_test(void)
+{
+	printk(KERN_INFO "Testing -fstack-protector-all feature\n");
+	__stack_check_testing = (unsigned long)&__stack_chk_test_func;
+	__stack_chk_test_func();
+	if (__stack_check_testing) {
+		printk(KERN_ERR "-fstack-protector-all test failed\n");
+		WARN_ON(1);
+	}
+	return 0;
+}
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
-	panic("stack-protector: Kernel stack is corrupted");
+	if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) {
+		long delta;
+
+		delta = (unsigned long)__builtin_return_address(0) -
+				__stack_check_testing;
+		/*
+		 * The test needs to happen inside the test function, so
+		 * check if the return address is close to that function.
+		 * The function is only 2 dozen bytes long, but keep a wide
+		 * safety margin to avoid panic()s for normal users regardless
+		 * of the quality of the compiler.
+		 */
+		if (delta >= 0 && delta <= 400) {
+			__stack_check_testing = 0;
+			return;
+		}
+	}
+	panic("stack-protector: Kernel stack is corrupted in: %p\n",
+		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
+
+late_initcall(__stack_chk_test);
 #endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..30bd5d4b2ac7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/rculist.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
 #include <linux/pid_namespace.h>
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc652..d416be0efa8a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -193,6 +193,7 @@ static int create_image(int platform_mode)
 	if (error)
 		return error;
 
+	device_pm_lock();
 	local_irq_disable();
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +225,11 @@ static int create_image(int platform_mode)
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
-	device_power_up();
+	device_power_up(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode)
  Finish:
 	platform_finish(platform_mode);
  Resume_devices:
-	device_resume();
+	device_resume(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
@@ -300,8 +304,9 @@ static int resume_target_kernel(void)
 {
 	int error;
 
+	device_pm_lock();
 	local_irq_disable();
-	error = device_power_down(PMSG_PRETHAW);
+	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -329,9 +334,10 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
-	device_power_up();
+	device_power_up(PMSG_RECOVER);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode)
 
 	pm_prepare_console();
 	suspend_console();
-	error = device_suspend(PMSG_PRETHAW);
+	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 		goto Finish;
 
@@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode)
 		enable_nonboot_cpus();
 	}
 	platform_restore_cleanup(platform_mode);
-	device_resume();
+	device_resume(PMSG_RECOVER);
  Finish:
 	resume_console();
 	pm_restore_console();
@@ -403,6 +409,7 @@ int hibernation_platform_enter(void)
 	if (error)
 		goto Finish;
 
+	device_pm_lock();
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
@@ -411,6 +418,7 @@ int hibernation_platform_enter(void)
 		while (1);
 	}
 	local_irq_enable();
+	device_pm_unlock();
 
 	/*
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,7 +427,7 @@ int hibernation_platform_enter(void)
  Finish:
 	hibernation_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524e..d023b6b584e5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 
+	device_pm_lock();
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
@@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state)
 	if (!suspend_test(TEST_CORE))
 		error = suspend_ops->enter(state);
 
-	device_power_up();
+	device_power_up(PMSG_RESUME);
  Done:
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
+	device_pm_unlock();
 	return error;
 }
 
@@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_ops->finish)
 		suspend_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESUME);
  Resume_console:
 	resume_console();
  Close:
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..c5b4aea992a6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -121,6 +121,8 @@ struct console_cmdline
 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
 static int selected_console = -1;
 static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
 
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
@@ -890,6 +892,7 @@ static int __init console_setup(char *str)
 	*s = 0;
 
 	__add_preferred_console(buf, idx, options, brl_options);
+	console_set_on_cmdline = 1;
 	return 1;
 }
 __setup("console=", console_setup);
@@ -986,7 +989,7 @@ EXPORT_SYMBOL(acquire_console_sem);
 
 int try_acquire_console_sem(void)
 {
-	if (down_trylock(&console_sem))
+	if (!down_try(&console_sem))
 		return -1;
 	console_locked = 1;
 	console_may_schedule = 0;
@@ -1041,7 +1044,9 @@ void release_console_sem(void)
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
 		spin_unlock(&logbuf_lock);
+		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
+		start_critical_timings();
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
@@ -1083,7 +1088,7 @@ void console_unblank(void)
 	 * oops_in_progress is set to 1..
 	 */
 	if (oops_in_progress) {
-		if (down_trylock(&console_sem) != 0)
+		if (!down_try(&console_sem))
 			return;
 	} else
 		acquire_console_sem();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6c19e94fd0a5..e337390fce01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
-int __ptrace_may_attach(struct task_struct *task)
+int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
@@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task)
 	if (!dumpable && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 
-	return security_ptrace(current, task);
+	return security_ptrace(current, task, mode);
 }
 
-int ptrace_may_attach(struct task_struct *task)
+bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	int err;
 	task_lock(task);
-	err = __ptrace_may_attach(task);
+	err = __ptrace_may_access(task, mode);
 	task_unlock(task);
-	return !err;
+	return (!err ? true : false);
 }
 
 int ptrace_attach(struct task_struct *task)
@@ -195,7 +195,7 @@ repeat:
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
 		goto bad;
-	retval = __ptrace_may_attach(task);
+	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
 	if (retval)
 		goto bad;
 
@@ -494,7 +494,8 @@ int ptrace_traceme(void)
 	 */
 	task_lock(current);
 	if (!(current->ptrace & PT_PTRACED)) {
-		ret = security_ptrace(current->parent, current);
+		ret = security_ptrace(current->parent, current,
+				      PTRACE_MODE_ATTACH);
 		/*
 		 * Set the ptrace bit in the process ptrace flags.
 		 */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..d8348792f9f5 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user)
 	if (user ||
 	    (idle_cpu(cpu) && !in_softirq() &&
 				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
+		 * by other CPUs to happen after any such write.
+		 */
+
+		smp_mb();  /* See above block comment. */
 		rcu_qsctr_inc(cpu);
 		rcu_bh_qsctr_inc(cpu);
-	} else if (!in_softirq())
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * critical section, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
+		 */
+
+		smp_mb();  /* See above block comment. */
 		rcu_bh_qsctr_inc(cpu);
+	}
 	raise_rcu_softirq();
 }
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..4a74b8d48d90 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,16 +39,16 @@
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
-#include <linux/completion.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
 
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
+enum rcu_barrier {
+	RCU_BARRIER_STD,
+	RCU_BARRIER_BH,
+	RCU_BARRIER_SCHED,
 };
 
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion;
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
  */
-static void wakeme_after_rcu(struct rcu_head  *head)
+void wakeme_after_rcu(struct rcu_head  *head)
 {
 	struct rcu_synchronize *rcu;
 
@@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head  *head)
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-
-	/* Wait for it */
-	wait_for_completion(&rcu.completion);
-}
+synchronize_rcu_xxx(synchronize_rcu, call_rcu)
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 static void rcu_barrier_callback(struct rcu_head *notused)
@@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 /*
  * Called with preemption disabled, and from cross-cpu IRQ context.
  */
-static void rcu_barrier_func(void *notused)
+static void rcu_barrier_func(void *type)
 {
 	int cpu = smp_processor_id();
 	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
 
 	atomic_inc(&rcu_barrier_cpu_count);
-	call_rcu(head, rcu_barrier_callback);
+	switch ((enum rcu_barrier)type) {
+	case RCU_BARRIER_STD:
+		call_rcu(head, rcu_barrier_callback);
+		break;
+	case RCU_BARRIER_BH:
+		call_rcu_bh(head, rcu_barrier_callback);
+		break;
+	case RCU_BARRIER_SCHED:
+		call_rcu_sched(head, rcu_barrier_callback);
+		break;
+	}
 }
 
-/**
- * rcu_barrier - Wait until all the in-flight RCUs are complete.
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
  */
-void rcu_barrier(void)
+static void _rcu_barrier(enum rcu_barrier type)
 {
 	BUG_ON(in_interrupt());
 	/* Take cpucontrol mutex to protect against CPU hotplug */
@@ -127,13 +128,39 @@ void rcu_barrier(void)
 	 * until all the callbacks are queued.
 	 */
 	rcu_read_lock();
-	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	on_each_cpu(rcu_barrier_func, (void *)type, 0, 1);
 	rcu_read_unlock();
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
 }
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+	_rcu_barrier(RCU_BARRIER_STD);
+}
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+	_rcu_barrier(RCU_BARRIER_BH);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+	_rcu_barrier(RCU_BARRIER_SCHED);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
 void __init rcu_init(void)
 {
 	__rcu_init();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..b8e4cdae4e8d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,11 +46,11 @@
 #include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
@@ -87,9 +87,14 @@ struct rcu_data {
 	struct rcu_head **nexttail;
 	struct rcu_head *waitlist[GP_STAGES];
 	struct rcu_head **waittail[GP_STAGES];
-	struct rcu_head *donelist;
+	struct rcu_head *donelist;	/* from waitlist & waitschedlist */
 	struct rcu_head **donetail;
 	long rcu_flipctr[2];
+	struct rcu_head *nextschedlist;
+	struct rcu_head **nextschedtail;
+	struct rcu_head *waitschedlist;
+	struct rcu_head **waitschedtail;
+	int rcu_sched_sleeping;
 #ifdef CONFIG_RCU_TRACE
 	struct rcupreempt_trace trace;
 #endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +136,24 @@ enum rcu_try_flip_states {
 	rcu_try_flip_waitmb_state,
 };
 
+/*
+ * States for rcu_ctrlblk.rcu_sched_sleep.
+ */
+
+enum rcu_sched_sleep_states {
+	rcu_sched_not_sleeping,	/* Not sleeping, callbacks need GP.  */
+	rcu_sched_sleep_prep,	/* Thinking of sleeping, rechecking. */
+	rcu_sched_sleeping,	/* Sleeping, awaken if GP needed. */
+};
+
 struct rcu_ctrlblk {
 	spinlock_t	fliplock;	/* Protect state-machine transitions. */
 	long		completed;	/* Number of last completed batch. */
 	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 							the rcu state machine */
+	spinlock_t	schedlock;	/* Protect rcu_sched sleep state. */
+	enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
 };
 
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +161,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 	.completed = 0,
 	.rcu_try_flip_state = rcu_try_flip_idle_state,
+	.schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+	.sched_sleep = rcu_sched_not_sleeping,
+	.sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
 };
 
+static struct task_struct *rcu_sched_grace_period_task;
 
 #ifdef CONFIG_RCU_TRACE
 static char *rcu_try_flip_state_names[] =
@@ -207,6 +229,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
  */
 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
 
+#define RCU_SCHED_BATCH_TIME (HZ / 50)
+
 /*
  * Return the number of RCU batches processed thus far.  Useful
  * for debug and statistics.
@@ -217,8 +241,6 @@ long rcu_batches_completed(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
 void __rcu_read_lock(void)
 {
 	int idx;
@@ -413,32 +435,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
-#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+	.dynticks = 1,
+};
 
-DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
-static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+#ifdef CONFIG_NO_HZ
 static DEFINE_PER_CPU(int, rcu_update_flag);
 
 /**
  * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  *
  * If the CPU was idle with dynamic ticks active, this updates the
- * dynticks_progress_counter to let the RCU handling know that the
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that the
  * CPU is active.
  */
 void rcu_irq_enter(void)
 {
 	int cpu = smp_processor_id();
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
 	if (per_cpu(rcu_update_flag, cpu))
 		per_cpu(rcu_update_flag, cpu)++;
 
 	/*
 	 * Only update if we are coming from a stopped ticks mode
-	 * (dynticks_progress_counter is even).
+	 * (rcu_dyntick_sched.dynticks is even).
 	 */
 	if (!in_interrupt() &&
-	    (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+	    (rdssp->dynticks & 0x1) == 0) {
 		/*
 		 * The following might seem like we could have a race
 		 * with NMI/SMIs. But this really isn't a problem.
@@ -461,12 +485,12 @@ void rcu_irq_enter(void)
 		 * RCU read-side critical sections on this CPU would
 		 * have already completed.
 		 */
-		per_cpu(dynticks_progress_counter, cpu)++;
+		rdssp->dynticks++;
 		/*
 		 * The following memory barrier ensures that any
 		 * rcu_read_lock() primitives in the irq handler
 		 * are seen by other CPUs to follow the above
-		 * increment to dynticks_progress_counter. This is
+		 * increment to rcu_dyntick_sched.dynticks. This is
 		 * required in order for other CPUs to correctly
 		 * determine when it is safe to advance the RCU
 		 * grace-period state machine.
@@ -474,7 +498,7 @@ void rcu_irq_enter(void)
 		smp_mb(); /* see above block comment. */
 		/*
 		 * Since we can't determine the dynamic tick mode from
-		 * the dynticks_progress_counter after this routine,
+		 * the rcu_dyntick_sched.dynticks after this routine,
 		 * we use a second flag to acknowledge that we came
 		 * from an idle state with ticks stopped.
 		 */
@@ -482,7 +506,7 @@ void rcu_irq_enter(void)
 		/*
 		 * If we take an NMI/SMI now, they will also increment
 		 * the rcu_update_flag, and will not update the
-		 * dynticks_progress_counter on exit. That is for
+		 * rcu_dyntick_sched.dynticks on exit. That is for
 		 * this IRQ to do.
 		 */
 	}
@@ -492,12 +516,13 @@ void rcu_irq_enter(void)
  * rcu_irq_exit - Called from exiting Hard irq context.
  *
  * If the CPU was idle with dynamic ticks active, update the
- * dynticks_progress_counter to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to put let the RCU handling be
  * aware that the CPU is going back to idle with no ticks.
  */
 void rcu_irq_exit(void)
 {
 	int cpu = smp_processor_id();
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
 	/*
 	 * rcu_update_flag is set if we interrupted the CPU
@@ -505,7 +530,7 @@ void rcu_irq_exit(void)
 	 * Once this occurs, we keep track of interrupt nesting
 	 * because a NMI/SMI could also come in, and we still
 	 * only want the IRQ that started the increment of the
-	 * dynticks_progress_counter to be the one that modifies
+	 * rcu_dyntick_sched.dynticks to be the one that modifies
 	 * it on exit.
 	 */
 	if (per_cpu(rcu_update_flag, cpu)) {
@@ -517,28 +542,29 @@ void rcu_irq_exit(void)
 
 		/*
 		 * If an NMI/SMI happens now we are still
-		 * protected by the dynticks_progress_counter being odd.
+		 * protected by the rcu_dyntick_sched.dynticks being odd.
 		 */
 
 		/*
 		 * The following memory barrier ensures that any
 		 * rcu_read_unlock() primitives in the irq handler
 		 * are seen by other CPUs to preceed the following
-		 * increment to dynticks_progress_counter. This
+		 * increment to rcu_dyntick_sched.dynticks. This
 		 * is required in order for other CPUs to determine
 		 * when it is safe to advance the RCU grace-period
 		 * state machine.
 		 */
 		smp_mb(); /* see above block comment. */
-		per_cpu(dynticks_progress_counter, cpu)++;
-		WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+		rdssp->dynticks++;
+		WARN_ON(rdssp->dynticks & 0x1);
 	}
 }
 
 static void dyntick_save_progress_counter(int cpu)
 {
-	per_cpu(rcu_dyntick_snapshot, cpu) =
-		per_cpu(dynticks_progress_counter, cpu);
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->dynticks_snap = rdssp->dynticks;
 }
 
 static inline int
@@ -546,9 +572,10 @@ rcu_try_flip_waitack_needed(int cpu)
 {
 	long curr;
 	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
-	curr = per_cpu(dynticks_progress_counter, cpu);
-	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	curr = rdssp->dynticks;
+	snap = rdssp->dynticks_snap;
 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 
 	/*
@@ -569,7 +596,7 @@ rcu_try_flip_waitack_needed(int cpu)
 	 * that this CPU already acknowledged the counter.
 	 */
 
-	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+	if ((curr - snap) > 2 || (curr & 0x1) == 0)
 		return 0;
 
 	/* We need this CPU to explicitly acknowledge the counter flip. */
@@ -582,9 +609,10 @@ rcu_try_flip_waitmb_needed(int cpu)
 {
 	long curr;
 	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
-	curr = per_cpu(dynticks_progress_counter, cpu);
-	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	curr = rdssp->dynticks;
+	snap = rdssp->dynticks_snap;
 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 
 	/*
@@ -611,14 +639,86 @@ rcu_try_flip_waitmb_needed(int cpu)
 	return 1;
 }
 
+static void dyntick_save_progress_counter_sched(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_dynticks_snap = rdssp->dynticks;
+}
+
+static int rcu_qsctr_inc_needed_dyntick(int cpu)
+{
+	long curr;
+	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	curr = rdssp->dynticks;
+	snap = rdssp->sched_dynticks_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot be in the middle of an rcu_read_lock(), so
+	 * the next rcu_read_lock() it executes must use the new value
+	 * of the counter.  Therefore, this CPU has been in a quiescent
+	 * state the entire time, and we don't need to wait for it.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then, as above, this CPU has already
+	 * passed through a quiescent state.
+	 */
+
+	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+		return 0;
+
+	/* We need this CPU to go through a quiescent state. */
+
+	return 1;
+}
+
 #else /* !CONFIG_NO_HZ */
 
-# define dyntick_save_progress_counter(cpu)	do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu)	(1)
-# define rcu_try_flip_waitmb_needed(cpu)	(1)
+# define dyntick_save_progress_counter(cpu)		do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu)		(1)
+# define rcu_try_flip_waitmb_needed(cpu)		(1)
+
+# define dyntick_save_progress_counter_sched(cpu)	do { } while (0)
+# define rcu_qsctr_inc_needed_dyntick(cpu)		(1)
 
 #endif /* CONFIG_NO_HZ */
 
+static void save_qsctr_sched(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_qs_snap = rdssp->sched_qs;
+}
+
+static inline int rcu_qsctr_inc_needed(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	/*
+	 * If there has been a quiescent state, no more need to wait
+	 * on this CPU.
+	 */
+
+	if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+		smp_mb(); /* force ordering with cpu entering schedule(). */
+		return 0;
+	}
+
+	/* We need this CPU to go through a quiescent state. */
+
+	return 1;
+}
+
 /*
  * Get here when RCU is idle.  Decide whether we need to
  * move out of idle state, and return non-zero if so.
@@ -821,6 +921,26 @@ void rcu_check_callbacks(int cpu, int user)
 	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 
+	/*
+	 * If this CPU took its interrupt from user mode or from the
+	 * idle loop, and this is not a nested interrupt, then
+	 * this CPU has to have exited all prior preept-disable
+	 * sections of code.  So increment the counter to note this.
+	 *
+	 * The memory barrier is needed to handle the case where
+	 * writes from a preempt-disable section of code get reordered
+	 * into schedule() by this CPU's write buffer.  So the memory
+	 * barrier makes sure that the rcu_qsctr_inc() is seen by other
+	 * CPUs to happen after any such write.
+	 */
+
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		smp_mb();	/* Guard against aggressive schedule(). */
+	     	rcu_qsctr_inc(cpu);
+	}
+
 	rcu_check_mb(cpu);
 	if (rcu_ctrlblk.completed == rdp->completed)
 		rcu_try_flip();
@@ -871,6 +991,8 @@ void rcu_offline_cpu(int cpu)
 	struct rcu_head *list = NULL;
 	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	struct rcu_head *schedlist = NULL;
+	struct rcu_head **schedtail = &schedlist;
 	struct rcu_head **tail = &list;
 
 	/*
@@ -884,6 +1006,11 @@ void rcu_offline_cpu(int cpu)
 		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
 						list, tail);
 	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+	rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+				schedlist, schedtail);
+	rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+				schedlist, schedtail);
+	rdp->rcu_sched_sleeping = 0;
 	spin_unlock_irqrestore(&rdp->lock, flags);
 	rdp->waitlistcount = 0;
 
@@ -918,22 +1045,40 @@ void rcu_offline_cpu(int cpu)
 	 * fix.
 	 */
 
-	local_irq_save(flags);
+	local_irq_save(flags);  /* disable preempt till we know what lock. */
 	rdp = RCU_DATA_ME();
 	spin_lock(&rdp->lock);
 	*rdp->nexttail = list;
 	if (list)
 		rdp->nexttail = tail;
+	*rdp->nextschedtail = schedlist;
+	if (schedlist)
+		rdp->nextschedtail = schedtail;
 	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 void __devinit rcu_online_cpu(int cpu)
 {
 	unsigned long flags;
+	struct rcu_data *rdp;
 
 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 	cpu_set(cpu, rcu_cpu_online_map);
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+	/*
+	 * The rcu_sched grace-period processing might have bypassed
+	 * this CPU, given that it was not in the rcu_cpu_online_map
+	 * when the grace-period scan started.  This means that the
+	 * grace-period task might sleep.  So make sure that if this
+	 * should happen, the first callback posted to this CPU will
+	 * wake up the grace-period task if need be.
+	 */
+
+	rdp = RCU_DATA_CPU(cpu);
+	spin_lock_irqsave(&rdp->lock, flags);
+	rdp->rcu_sched_sleeping = 1;
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -988,31 +1133,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 	*rdp->nexttail = head;
 	rdp->nexttail = &head->next;
 	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-	spin_unlock(&rdp->lock);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	int wake_gp = 0;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
+	*rdp->nextschedtail = head;
+	rdp->nextschedtail = &head->next;
+	if (rdp->rcu_sched_sleeping) {
+
+		/* Grace-period processing might be sleeping... */
+
+		rdp->rcu_sched_sleeping = 0;
+		wake_gp = 1;
+	}
+	spin_unlock_irqrestore(&rdp->lock, flags);
+	if (wake_gp) {
+
+		/* Wake up grace-period processing, unless someone beat us. */
+
+		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+			wake_gp = 0;
+		rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		if (wake_gp)
+			wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+	}
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
 /*
  * Wait until all currently running preempt_disable() code segments
  * (including hardware-irq-disable segments) complete.  Note that
  * in -rt this does -not- necessarily result in all currently executing
  * interrupt -handlers- having completed.
  */
-void __synchronize_sched(void)
+synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * kthread function that manages call_rcu_sched grace periods.
+ */
+static int rcu_sched_grace_period(void *arg)
 {
-	cpumask_t oldmask;
+	int couldsleep;		/* might sleep after current pass. */
+	int couldsleepnext = 0; /* might sleep after next pass. */
 	int cpu;
+	unsigned long flags;
+	struct rcu_data *rdp;
+	int ret;
 
-	if (sched_getaffinity(0, &oldmask) < 0)
-		oldmask = cpu_possible_map;
-	for_each_online_cpu(cpu) {
-		sched_setaffinity(0, &cpumask_of_cpu(cpu));
-		schedule();
-	}
-	sched_setaffinity(0, &oldmask);
+	/*
+	 * Each pass through the following loop handles one
+	 * rcu_sched grace period cycle.
+	 */
+	do {
+		/* Save each CPU's current state. */
+
+		for_each_online_cpu(cpu) {
+			dyntick_save_progress_counter_sched(cpu);
+			save_qsctr_sched(cpu);
+		}
+
+		/*
+		 * Sleep for about an RCU grace-period's worth to
+		 * allow better batching and to consume less CPU.
+		 */
+		schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+
+		/*
+		 * If there was nothing to do last time, prepare to
+		 * sleep at the end of the current grace period cycle.
+		 */
+		couldsleep = couldsleepnext;
+		couldsleepnext = 1;
+		if (couldsleep) {
+			spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+			rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		}
+
+		/*
+		 * Wait on each CPU in turn to have either visited
+		 * a quiescent state or been in dynticks-idle mode.
+		 */
+		for_each_online_cpu(cpu) {
+			while (rcu_qsctr_inc_needed(cpu) &&
+			       rcu_qsctr_inc_needed_dyntick(cpu)) {
+				/* resched_cpu(cpu); @@@ */
+				schedule_timeout_interruptible(1);
+			}
+		}
+
+		/* Advance callbacks for each CPU.  */
+
+		for_each_online_cpu(cpu) {
+
+			rdp = RCU_DATA_CPU(cpu);
+			spin_lock_irqsave(&rdp->lock, flags);
+
+			/*
+			 * We are running on this CPU irq-disabled, so no
+			 * CPU can go offline until we re-enable irqs.
+			 * The current CPU might have already gone
+			 * offline (between the for_each_offline_cpu and
+			 * the spin_lock_irqsave), but in that case all its
+			 * callback lists will be empty, so no harm done.
+			 *
+			 * Advance the callbacks!  We share normal RCU's
+			 * donelist, since callbacks are invoked the
+			 * same way in either case.
+			 */
+			if (rdp->waitschedlist != NULL) {
+				*rdp->donetail = rdp->waitschedlist;
+				rdp->donetail = rdp->waitschedtail;
+
+				/*
+				 * Next rcu_check_callbacks() will
+				 * do the required raise_softirq().
+				 */
+			}
+			if (rdp->nextschedlist != NULL) {
+				rdp->waitschedlist = rdp->nextschedlist;
+				rdp->waitschedtail = rdp->nextschedtail;
+				couldsleep = 0;
+				couldsleepnext = 0;
+			} else {
+				rdp->waitschedlist = NULL;
+				rdp->waitschedtail = &rdp->waitschedlist;
+			}
+			rdp->nextschedlist = NULL;
+			rdp->nextschedtail = &rdp->nextschedlist;
+
+			/* Mark sleep intention. */
+
+			rdp->rcu_sched_sleeping = couldsleep;
+
+			spin_unlock_irqrestore(&rdp->lock, flags);
+		}
+
+		/* If we saw callbacks on the last scan, go deal with them. */
+
+		if (!couldsleep)
+			continue;
+
+		/* Attempt to block... */
+
+		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+
+			/*
+			 * Someone posted a callback after we scanned.
+			 * Go take care of it.
+			 */
+			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+			couldsleepnext = 0;
+			continue;
+		}
+
+		/* Block until the next person posts a callback. */
+
+		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		ret = 0;
+		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
+			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+			ret);
+
+		/*
+		 * Signals would prevent us from sleeping, and we cannot
+		 * do much with them in any case.  So flush them.
+		 */
+		if (ret)
+			flush_signals(current);
+		couldsleepnext = 0;
+
+	} while (!kthread_should_stop());
+
+	return (0);
 }
-EXPORT_SYMBOL_GPL(__synchronize_sched);
 
 /*
  * Check to see if any future RCU-related work will need to be done
@@ -1029,7 +1339,9 @@ int rcu_needs_cpu(int cpu)
 
 	return (rdp->donelist != NULL ||
 		!!rdp->waitlistcount ||
-		rdp->nextlist != NULL);
+		rdp->nextlist != NULL ||
+		rdp->nextschedlist != NULL ||
+		rdp->waitschedlist != NULL);
 }
 
 int rcu_pending(int cpu)
@@ -1040,7 +1352,9 @@ int rcu_pending(int cpu)
 
 	if (rdp->donelist != NULL ||
 	    !!rdp->waitlistcount ||
-	    rdp->nextlist != NULL)
+	    rdp->nextlist != NULL ||
+	    rdp->nextschedlist != NULL ||
+	    rdp->waitschedlist != NULL)
 		return 1;
 
 	/* The RCU core needs an acknowledgement from this CPU. */
@@ -1107,6 +1421,11 @@ void __init __rcu_init(void)
 		rdp->donetail = &rdp->donelist;
 		rdp->rcu_flipctr[0] = 0;
 		rdp->rcu_flipctr[1] = 0;
+		rdp->nextschedlist = NULL;
+		rdp->nextschedtail = &rdp->nextschedlist;
+		rdp->waitschedlist = NULL;
+		rdp->waitschedtail = &rdp->waitschedlist;
+		rdp->rcu_sched_sleeping = 0;
 	}
 	register_cpu_notifier(&rcu_nb);
 
@@ -1129,11 +1448,15 @@ void __init __rcu_init(void)
 }
 
 /*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * Late-boot-time RCU initialization that must wait until after scheduler
+ * has been initialized.
  */
-void synchronize_kernel(void)
+void __init rcu_init_sched(void)
 {
-	synchronize_rcu();
+	rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+						  NULL,
+						  "rcu_sched_grace_period");
+	WARN_ON(IS_ERR(rcu_sched_grace_period_task));
 }
 
 #ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..5edf82c34bbc 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/rcupreempt_trace.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
deleted file mode 100644
index 33acc424667e..000000000000
--- a/kernel/rcutorture.c
+++ /dev/null
@@ -1,999 +0,0 @@
-/*
- * Read-Copy Update module-based torture test facility
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2005, 2006
- *
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- *          Josh Triplett <josh@freedesktop.org>
- *
- * See also:  Documentation/RCU/torture.txt
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/err.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/freezer.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/byteorder/swabb.h>
-#include <linux/stat.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
-              "Josh Triplett <josh@freedesktop.org>");
-
-static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
-static int nfakewriters = 4;	/* # fake writer threads */
-static int stat_interval;	/* Interval between stats, in seconds. */
-				/*  Defaults to "only at end of test". */
-static int verbose;		/* Print more debug info. */
-static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
-static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-static char *torture_type = "rcu"; /* What RCU implementation to torture. */
-
-module_param(nreaders, int, 0444);
-MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-module_param(nfakewriters, int, 0444);
-MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0444);
-MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-module_param(verbose, bool, 0444);
-MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-module_param(test_no_idle_hz, bool, 0444);
-MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-module_param(shuffle_interval, int, 0444);
-MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-module_param(torture_type, charp, 0444);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
-
-#define TORTURE_FLAG "-torture:"
-#define PRINTK_STRING(s) \
-	do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_STRING(s) \
-	do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_ERRSTRING(s) \
-	do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
-
-static char printk_buf[4096];
-
-static int nrealreaders;
-static struct task_struct *writer_task;
-static struct task_struct **fakewriter_tasks;
-static struct task_struct **reader_tasks;
-static struct task_struct *stats_task;
-static struct task_struct *shuffler_task;
-
-#define RCU_TORTURE_PIPE_LEN 10
-
-struct rcu_torture {
-	struct rcu_head rtort_rcu;
-	int rtort_pipe_count;
-	struct list_head rtort_free;
-	int rtort_mbtest;
-};
-
-static int fullstop = 0;	/* stop generating callbacks at test end. */
-static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current = NULL;
-static long rcu_torture_current_version = 0;
-static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
-static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
-	{ 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
-	{ 0 };
-static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
-static atomic_t n_rcu_torture_alloc;
-static atomic_t n_rcu_torture_alloc_fail;
-static atomic_t n_rcu_torture_free;
-static atomic_t n_rcu_torture_mberror;
-static atomic_t n_rcu_torture_error;
-static struct list_head rcu_torture_removed;
-
-/*
- * Allocate an element from the rcu_tortures pool.
- */
-static struct rcu_torture *
-rcu_torture_alloc(void)
-{
-	struct list_head *p;
-
-	spin_lock_bh(&rcu_torture_lock);
-	if (list_empty(&rcu_torture_freelist)) {
-		atomic_inc(&n_rcu_torture_alloc_fail);
-		spin_unlock_bh(&rcu_torture_lock);
-		return NULL;
-	}
-	atomic_inc(&n_rcu_torture_alloc);
-	p = rcu_torture_freelist.next;
-	list_del_init(p);
-	spin_unlock_bh(&rcu_torture_lock);
-	return container_of(p, struct rcu_torture, rtort_free);
-}
-
-/*
- * Free an element to the rcu_tortures pool.
- */
-static void
-rcu_torture_free(struct rcu_torture *p)
-{
-	atomic_inc(&n_rcu_torture_free);
-	spin_lock_bh(&rcu_torture_lock);
-	list_add_tail(&p->rtort_free, &rcu_torture_freelist);
-	spin_unlock_bh(&rcu_torture_lock);
-}
-
-struct rcu_random_state {
-	unsigned long rrs_state;
-	long rrs_count;
-};
-
-#define RCU_RANDOM_MULT 39916801  /* prime */
-#define RCU_RANDOM_ADD	479001701 /* prime */
-#define RCU_RANDOM_REFRESH 10000
-
-#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
-
-/*
- * Crude but fast random-number generator.  Uses a linear congruential
- * generator, with occasional help from cpu_clock().
- */
-static unsigned long
-rcu_random(struct rcu_random_state *rrsp)
-{
-	if (--rrsp->rrs_count < 0) {
-		rrsp->rrs_state +=
-			(unsigned long)cpu_clock(raw_smp_processor_id());
-		rrsp->rrs_count = RCU_RANDOM_REFRESH;
-	}
-	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
-	return swahw32(rrsp->rrs_state);
-}
-
-/*
- * Operations vector for selecting different types of tests.
- */
-
-struct rcu_torture_ops {
-	void (*init)(void);
-	void (*cleanup)(void);
-	int (*readlock)(void);
-	void (*readdelay)(struct rcu_random_state *rrsp);
-	void (*readunlock)(int idx);
-	int (*completed)(void);
-	void (*deferredfree)(struct rcu_torture *p);
-	void (*sync)(void);
-	int (*stats)(char *page);
-	char *name;
-};
-static struct rcu_torture_ops *cur_ops = NULL;
-
-/*
- * Definitions for rcu torture testing.
- */
-
-static int rcu_torture_read_lock(void) __acquires(RCU)
-{
-	rcu_read_lock();
-	return 0;
-}
-
-static void rcu_read_delay(struct rcu_random_state *rrsp)
-{
-	long delay;
-	const long longdelay = 200;
-
-	/* We want there to be long-running readers, but not all the time. */
-
-	delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
-	if (!delay)
-		udelay(longdelay);
-}
-
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-static int rcu_torture_completed(void)
-{
-	return rcu_batches_completed();
-}
-
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
-	int i;
-	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
-	if (fullstop) {
-		/* Test is ending, just drop callbacks on the floor. */
-		/* The next initialization will pick up the pieces. */
-		return;
-	}
-	i = rp->rtort_pipe_count;
-	if (i > RCU_TORTURE_PIPE_LEN)
-		i = RCU_TORTURE_PIPE_LEN;
-	atomic_inc(&rcu_torture_wcount[i]);
-	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
-		rp->rtort_mbtest = 0;
-		rcu_torture_free(rp);
-	} else
-		cur_ops->deferredfree(rp);
-}
-
-static void rcu_torture_deferred_free(struct rcu_torture *p)
-{
-	call_rcu(&p->rtort_rcu, rcu_torture_cb);
-}
-
-static struct rcu_torture_ops rcu_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.stats = NULL,
-	.name = "rcu"
-};
-
-static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
-{
-	int i;
-	struct rcu_torture *rp;
-	struct rcu_torture *rp1;
-
-	cur_ops->sync();
-	list_add(&p->rtort_free, &rcu_torture_removed);
-	list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
-		i = rp->rtort_pipe_count;
-		if (i > RCU_TORTURE_PIPE_LEN)
-			i = RCU_TORTURE_PIPE_LEN;
-		atomic_inc(&rcu_torture_wcount[i]);
-		if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
-			rp->rtort_mbtest = 0;
-			list_del(&rp->rtort_free);
-			rcu_torture_free(rp);
-		}
-	}
-}
-
-static void rcu_sync_torture_init(void)
-{
-	INIT_LIST_HEAD(&rcu_torture_removed);
-}
-
-static struct rcu_torture_ops rcu_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.stats = NULL,
-	.name = "rcu_sync"
-};
-
-/*
- * Definitions for rcu_bh torture testing.
- */
-
-static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
-{
-	rcu_read_lock_bh();
-	return 0;
-}
-
-static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
-{
-	rcu_read_unlock_bh();
-}
-
-static int rcu_bh_torture_completed(void)
-{
-	return rcu_batches_completed_bh();
-}
-
-static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
-{
-	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
-}
-
-struct rcu_bh_torture_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
-{
-	struct rcu_bh_torture_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
-	complete(&rcu->completion);
-}
-
-static void rcu_bh_torture_synchronize(void)
-{
-	struct rcu_bh_torture_synchronize rcu;
-
-	init_completion(&rcu.completion);
-	call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
-	wait_for_completion(&rcu.completion);
-}
-
-static struct rcu_torture_ops rcu_bh_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_bh_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.stats = NULL,
-	.name = "rcu_bh"
-};
-
-static struct rcu_torture_ops rcu_bh_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.stats = NULL,
-	.name = "rcu_bh_sync"
-};
-
-/*
- * Definitions for srcu torture testing.
- */
-
-static struct srcu_struct srcu_ctl;
-
-static void srcu_torture_init(void)
-{
-	init_srcu_struct(&srcu_ctl);
-	rcu_sync_torture_init();
-}
-
-static void srcu_torture_cleanup(void)
-{
-	synchronize_srcu(&srcu_ctl);
-	cleanup_srcu_struct(&srcu_ctl);
-}
-
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
-{
-	return srcu_read_lock(&srcu_ctl);
-}
-
-static void srcu_read_delay(struct rcu_random_state *rrsp)
-{
-	long delay;
-	const long uspertick = 1000000 / HZ;
-	const long longdelay = 10;
-
-	/* We want there to be long-running readers, but not all the time. */
-
-	delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
-	if (!delay)
-		schedule_timeout_interruptible(longdelay);
-}
-
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
-{
-	srcu_read_unlock(&srcu_ctl, idx);
-}
-
-static int srcu_torture_completed(void)
-{
-	return srcu_batches_completed(&srcu_ctl);
-}
-
-static void srcu_torture_synchronize(void)
-{
-	synchronize_srcu(&srcu_ctl);
-}
-
-static int srcu_torture_stats(char *page)
-{
-	int cnt = 0;
-	int cpu;
-	int idx = srcu_ctl.completed & 0x1;
-
-	cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
-		       torture_type, TORTURE_FLAG, idx);
-	for_each_possible_cpu(cpu) {
-		cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
-			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
-			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
-	}
-	cnt += sprintf(&page[cnt], "\n");
-	return cnt;
-}
-
-static struct rcu_torture_ops srcu_ops = {
-	.init = srcu_torture_init,
-	.cleanup = srcu_torture_cleanup,
-	.readlock = srcu_torture_read_lock,
-	.readdelay = srcu_read_delay,
-	.readunlock = srcu_torture_read_unlock,
-	.completed = srcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = srcu_torture_synchronize,
-	.stats = srcu_torture_stats,
-	.name = "srcu"
-};
-
-/*
- * Definitions for sched torture testing.
- */
-
-static int sched_torture_read_lock(void)
-{
-	preempt_disable();
-	return 0;
-}
-
-static void sched_torture_read_unlock(int idx)
-{
-	preempt_enable();
-}
-
-static int sched_torture_completed(void)
-{
-	return 0;
-}
-
-static void sched_torture_synchronize(void)
-{
-	synchronize_sched();
-}
-
-static struct rcu_torture_ops sched_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = sched_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = sched_torture_read_unlock,
-	.completed = sched_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = sched_torture_synchronize,
-	.stats = NULL,
-	.name = "sched"
-};
-
-/*
- * RCU torture writer kthread.  Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
- */
-static int
-rcu_torture_writer(void *arg)
-{
-	int i;
-	long oldbatch = rcu_batches_completed();
-	struct rcu_torture *rp;
-	struct rcu_torture *old_rp;
-	static DEFINE_RCU_RANDOM(rand);
-
-	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
-	set_user_nice(current, 19);
-
-	do {
-		schedule_timeout_uninterruptible(1);
-		if ((rp = rcu_torture_alloc()) == NULL)
-			continue;
-		rp->rtort_pipe_count = 0;
-		udelay(rcu_random(&rand) & 0x3ff);
-		old_rp = rcu_torture_current;
-		rp->rtort_mbtest = 1;
-		rcu_assign_pointer(rcu_torture_current, rp);
-		smp_wmb();
-		if (old_rp) {
-			i = old_rp->rtort_pipe_count;
-			if (i > RCU_TORTURE_PIPE_LEN)
-				i = RCU_TORTURE_PIPE_LEN;
-			atomic_inc(&rcu_torture_wcount[i]);
-			old_rp->rtort_pipe_count++;
-			cur_ops->deferredfree(old_rp);
-		}
-		rcu_torture_current_version++;
-		oldbatch = cur_ops->completed();
-	} while (!kthread_should_stop() && !fullstop);
-	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
-	while (!kthread_should_stop())
-		schedule_timeout_uninterruptible(1);
-	return 0;
-}
-
-/*
- * RCU torture fake writer kthread.  Repeatedly calls sync, with a random
- * delay between calls.
- */
-static int
-rcu_torture_fakewriter(void *arg)
-{
-	DEFINE_RCU_RANDOM(rand);
-
-	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
-	set_user_nice(current, 19);
-
-	do {
-		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
-		udelay(rcu_random(&rand) & 0x3ff);
-		cur_ops->sync();
-	} while (!kthread_should_stop() && !fullstop);
-
-	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
-	while (!kthread_should_stop())
-		schedule_timeout_uninterruptible(1);
-	return 0;
-}
-
-/*
- * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
- * incrementing the corresponding element of the pipeline array.  The
- * counter in the element should never be greater than 1, otherwise, the
- * RCU implementation is broken.
- */
-static int
-rcu_torture_reader(void *arg)
-{
-	int completed;
-	int idx;
-	DEFINE_RCU_RANDOM(rand);
-	struct rcu_torture *p;
-	int pipe_count;
-
-	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
-	set_user_nice(current, 19);
-
-	do {
-		idx = cur_ops->readlock();
-		completed = cur_ops->completed();
-		p = rcu_dereference(rcu_torture_current);
-		if (p == NULL) {
-			/* Wait for rcu_torture_writer to get underway */
-			cur_ops->readunlock(idx);
-			schedule_timeout_interruptible(HZ);
-			continue;
-		}
-		if (p->rtort_mbtest == 0)
-			atomic_inc(&n_rcu_torture_mberror);
-		cur_ops->readdelay(&rand);
-		preempt_disable();
-		pipe_count = p->rtort_pipe_count;
-		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
-			/* Should not happen, but... */
-			pipe_count = RCU_TORTURE_PIPE_LEN;
-		}
-		++__get_cpu_var(rcu_torture_count)[pipe_count];
-		completed = cur_ops->completed() - completed;
-		if (completed > RCU_TORTURE_PIPE_LEN) {
-			/* Should not happen, but... */
-			completed = RCU_TORTURE_PIPE_LEN;
-		}
-		++__get_cpu_var(rcu_torture_batch)[completed];
-		preempt_enable();
-		cur_ops->readunlock(idx);
-		schedule();
-	} while (!kthread_should_stop() && !fullstop);
-	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
-	while (!kthread_should_stop())
-		schedule_timeout_uninterruptible(1);
-	return 0;
-}
-
-/*
- * Create an RCU-torture statistics message in the specified buffer.
- */
-static int
-rcu_torture_printk(char *page)
-{
-	int cnt = 0;
-	int cpu;
-	int i;
-	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
-	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
-
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
-			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
-			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
-		}
-	}
-	for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
-		if (pipesummary[i] != 0)
-			break;
-	}
-	cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
-	cnt += sprintf(&page[cnt],
-		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-		       "rtmbe: %d",
-		       rcu_torture_current,
-		       rcu_torture_current_version,
-		       list_empty(&rcu_torture_freelist),
-		       atomic_read(&n_rcu_torture_alloc),
-		       atomic_read(&n_rcu_torture_alloc_fail),
-		       atomic_read(&n_rcu_torture_free),
-		       atomic_read(&n_rcu_torture_mberror));
-	if (atomic_read(&n_rcu_torture_mberror) != 0)
-		cnt += sprintf(&page[cnt], " !!!");
-	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
-	if (i > 1) {
-		cnt += sprintf(&page[cnt], "!!! ");
-		atomic_inc(&n_rcu_torture_error);
-	}
-	cnt += sprintf(&page[cnt], "Reader Pipe: ");
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
-		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
-	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
-	cnt += sprintf(&page[cnt], "Reader Batch: ");
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
-		cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
-	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
-	cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
-		cnt += sprintf(&page[cnt], " %d",
-			       atomic_read(&rcu_torture_wcount[i]));
-	}
-	cnt += sprintf(&page[cnt], "\n");
-	if (cur_ops->stats)
-		cnt += cur_ops->stats(&page[cnt]);
-	return cnt;
-}
-
-/*
- * Print torture statistics.  Caller must ensure that there is only
- * one call to this function at a given time!!!  This is normally
- * accomplished by relying on the module system to only have one copy
- * of the module loaded, and then by giving the rcu_torture_stats
- * kthread full control (or the init/cleanup functions when rcu_torture_stats
- * thread is not running).
- */
-static void
-rcu_torture_stats_print(void)
-{
-	int cnt;
-
-	cnt = rcu_torture_printk(printk_buf);
-	printk(KERN_ALERT "%s", printk_buf);
-}
-
-/*
- * Periodically prints torture statistics, if periodic statistics printing
- * was specified via the stat_interval module parameter.
- *
- * No need to worry about fullstop here, since this one doesn't reference
- * volatile state or register callbacks.
- */
-static int
-rcu_torture_stats(void *arg)
-{
-	VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
-	do {
-		schedule_timeout_interruptible(stat_interval * HZ);
-		rcu_torture_stats_print();
-	} while (!kthread_should_stop());
-	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
-	return 0;
-}
-
-static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
-
-/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
- * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
- */
-static void rcu_torture_shuffle_tasks(void)
-{
-	cpumask_t tmp_mask;
-	int i;
-
-	cpus_setall(tmp_mask);
-	get_online_cpus();
-
-	/* No point in shuffling if there is only one online CPU (ex: UP) */
-	if (num_online_cpus() == 1) {
-		put_online_cpus();
-		return;
-	}
-
-	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
-
-	set_cpus_allowed_ptr(current, &tmp_mask);
-
-	if (reader_tasks) {
-		for (i = 0; i < nrealreaders; i++)
-			if (reader_tasks[i])
-				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
-	}
-
-	if (fakewriter_tasks) {
-		for (i = 0; i < nfakewriters; i++)
-			if (fakewriter_tasks[i])
-				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
-	}
-
-	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
-
-	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
-
-	if (rcu_idle_cpu == -1)
-		rcu_idle_cpu = num_online_cpus() - 1;
-	else
-		rcu_idle_cpu--;
-
-	put_online_cpus();
-}
-
-/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
- * system to become idle at a time and cut off its timer ticks. This is meant
- * to test the support for such tickless idle CPU in RCU.
- */
-static int
-rcu_torture_shuffle(void *arg)
-{
-	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
-	do {
-		schedule_timeout_interruptible(shuffle_interval * HZ);
-		rcu_torture_shuffle_tasks();
-	} while (!kthread_should_stop());
-	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
-	return 0;
-}
-
-static inline void
-rcu_torture_print_module_parms(char *tag)
-{
-	printk(KERN_ALERT "%s" TORTURE_FLAG
-		"--- %s: nreaders=%d nfakewriters=%d "
-		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
-		"shuffle_interval = %d\n",
-		torture_type, tag, nrealreaders, nfakewriters,
-		stat_interval, verbose, test_no_idle_hz, shuffle_interval);
-}
-
-static void
-rcu_torture_cleanup(void)
-{
-	int i;
-
-	fullstop = 1;
-	if (shuffler_task) {
-		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
-		kthread_stop(shuffler_task);
-	}
-	shuffler_task = NULL;
-
-	if (writer_task) {
-		VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
-		kthread_stop(writer_task);
-	}
-	writer_task = NULL;
-
-	if (reader_tasks) {
-		for (i = 0; i < nrealreaders; i++) {
-			if (reader_tasks[i]) {
-				VERBOSE_PRINTK_STRING(
-					"Stopping rcu_torture_reader task");
-				kthread_stop(reader_tasks[i]);
-			}
-			reader_tasks[i] = NULL;
-		}
-		kfree(reader_tasks);
-		reader_tasks = NULL;
-	}
-	rcu_torture_current = NULL;
-
-	if (fakewriter_tasks) {
-		for (i = 0; i < nfakewriters; i++) {
-			if (fakewriter_tasks[i]) {
-				VERBOSE_PRINTK_STRING(
-					"Stopping rcu_torture_fakewriter task");
-				kthread_stop(fakewriter_tasks[i]);
-			}
-			fakewriter_tasks[i] = NULL;
-		}
-		kfree(fakewriter_tasks);
-		fakewriter_tasks = NULL;
-	}
-
-	if (stats_task) {
-		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
-		kthread_stop(stats_task);
-	}
-	stats_task = NULL;
-
-	/* Wait for all RCU callbacks to fire.  */
-	rcu_barrier();
-
-	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
-
-	if (cur_ops->cleanup)
-		cur_ops->cleanup();
-	if (atomic_read(&n_rcu_torture_error))
-		rcu_torture_print_module_parms("End of test: FAILURE");
-	else
-		rcu_torture_print_module_parms("End of test: SUCCESS");
-}
-
-static int __init
-rcu_torture_init(void)
-{
-	int i;
-	int cpu;
-	int firsterr = 0;
-	static struct rcu_torture_ops *torture_ops[] =
-		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
-		  &srcu_ops, &sched_ops, };
-
-	/* Process args and tell the world that the torturer is on the job. */
-	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
-		cur_ops = torture_ops[i];
-		if (strcmp(torture_type, cur_ops->name) == 0)
-			break;
-	}
-	if (i == ARRAY_SIZE(torture_ops)) {
-		printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
-		       torture_type);
-		return (-EINVAL);
-	}
-	if (cur_ops->init)
-		cur_ops->init(); /* no "goto unwind" prior to this point!!! */
-
-	if (nreaders >= 0)
-		nrealreaders = nreaders;
-	else
-		nrealreaders = 2 * num_online_cpus();
-	rcu_torture_print_module_parms("Start of test");
-	fullstop = 0;
-
-	/* Set up the freelist. */
-
-	INIT_LIST_HEAD(&rcu_torture_freelist);
-	for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
-		rcu_tortures[i].rtort_mbtest = 0;
-		list_add_tail(&rcu_tortures[i].rtort_free,
-			      &rcu_torture_freelist);
-	}
-
-	/* Initialize the statistics so that each run gets its own numbers. */
-
-	rcu_torture_current = NULL;
-	rcu_torture_current_version = 0;
-	atomic_set(&n_rcu_torture_alloc, 0);
-	atomic_set(&n_rcu_torture_alloc_fail, 0);
-	atomic_set(&n_rcu_torture_free, 0);
-	atomic_set(&n_rcu_torture_mberror, 0);
-	atomic_set(&n_rcu_torture_error, 0);
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
-		atomic_set(&rcu_torture_wcount[i], 0);
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
-			per_cpu(rcu_torture_count, cpu)[i] = 0;
-			per_cpu(rcu_torture_batch, cpu)[i] = 0;
-		}
-	}
-
-	/* Start up the kthreads. */
-
-	VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
-	writer_task = kthread_run(rcu_torture_writer, NULL,
-				  "rcu_torture_writer");
-	if (IS_ERR(writer_task)) {
-		firsterr = PTR_ERR(writer_task);
-		VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
-		writer_task = NULL;
-		goto unwind;
-	}
-	fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
-	                           GFP_KERNEL);
-	if (fakewriter_tasks == NULL) {
-		VERBOSE_PRINTK_ERRSTRING("out of memory");
-		firsterr = -ENOMEM;
-		goto unwind;
-	}
-	for (i = 0; i < nfakewriters; i++) {
-		VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
-		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
-		                                  "rcu_torture_fakewriter");
-		if (IS_ERR(fakewriter_tasks[i])) {
-			firsterr = PTR_ERR(fakewriter_tasks[i]);
-			VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
-			fakewriter_tasks[i] = NULL;
-			goto unwind;
-		}
-	}
-	reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
-			       GFP_KERNEL);
-	if (reader_tasks == NULL) {
-		VERBOSE_PRINTK_ERRSTRING("out of memory");
-		firsterr = -ENOMEM;
-		goto unwind;
-	}
-	for (i = 0; i < nrealreaders; i++) {
-		VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
-		reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
-					      "rcu_torture_reader");
-		if (IS_ERR(reader_tasks[i])) {
-			firsterr = PTR_ERR(reader_tasks[i]);
-			VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
-			reader_tasks[i] = NULL;
-			goto unwind;
-		}
-	}
-	if (stat_interval > 0) {
-		VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
-		stats_task = kthread_run(rcu_torture_stats, NULL,
-					"rcu_torture_stats");
-		if (IS_ERR(stats_task)) {
-			firsterr = PTR_ERR(stats_task);
-			VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
-			stats_task = NULL;
-			goto unwind;
-		}
-	}
-	if (test_no_idle_hz) {
-		rcu_idle_cpu = num_online_cpus() - 1;
-		/* Create the shuffler thread */
-		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
-					  "rcu_torture_shuffle");
-		if (IS_ERR(shuffler_task)) {
-			firsterr = PTR_ERR(shuffler_task);
-			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
-			shuffler_task = NULL;
-			goto unwind;
-		}
-	}
-	return 0;
-
-unwind:
-	rcu_torture_cleanup();
-	return firsterr;
-}
-
-module_init(rcu_torture_init);
-module_exit(rcu_torture_cleanup);
diff --git a/kernel/relay.c b/kernel/relay.c
index bc24dcdc570f..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
 	ret = 0;
 	spliced = 0;
 
-	while (len) {
+	while (len && !spliced) {
 		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
 		if (ret < 0)
 			break;
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
deleted file mode 100644
index 092e4c620af9..000000000000
--- a/kernel/rtmutex-tester.c
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/sysdev.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS	8
-#define MAX_RT_TEST_MUTEXES	8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
-	int			opcode;
-	int			opdata;
-	int			mutexes[MAX_RT_TEST_MUTEXES];
-	int			bkl;
-	int			event;
-	struct sys_device	sysdev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
-	RTTEST_NOP = 0,
-	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
-	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
-	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
-	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
-	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
-	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
-	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
-	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	RTTEST_LOCKBKL,		/* 9 Lock BKL */
-	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
-	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
-	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
-	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
-	int i, id, ret = -EINVAL;
-
-	switch(td->opcode) {
-
-	case RTTEST_NOP:
-		return 0;
-
-	case RTTEST_LOCKCONT:
-		td->mutexes[td->opdata] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		return 0;
-
-	case RTTEST_RESET:
-		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
-			if (td->mutexes[i] == 4) {
-				rt_mutex_unlock(&mutexes[i]);
-				td->mutexes[i] = 0;
-			}
-		}
-
-		if (!lockwakeup && td->bkl == 4) {
-			unlock_kernel();
-			td->bkl = 0;
-		}
-		return 0;
-
-	case RTTEST_RESETEVENT:
-		atomic_set(&rttest_event, 0);
-		return 0;
-
-	default:
-		if (lockwakeup)
-			return ret;
-	}
-
-	switch(td->opcode) {
-
-	case RTTEST_LOCK:
-	case RTTEST_LOCKNOWAIT:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
-			return ret;
-
-		td->mutexes[id] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		rt_mutex_lock(&mutexes[id]);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = 4;
-		return 0;
-
-	case RTTEST_LOCKINT:
-	case RTTEST_LOCKINTNOWAIT:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
-			return ret;
-
-		td->mutexes[id] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = ret ? 0 : 4;
-		return ret ? -EINTR : 0;
-
-	case RTTEST_UNLOCK:
-		id = td->opdata;
-		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
-			return ret;
-
-		td->event = atomic_add_return(1, &rttest_event);
-		rt_mutex_unlock(&mutexes[id]);
-		td->event = atomic_add_return(1, &rttest_event);
-		td->mutexes[id] = 0;
-		return 0;
-
-	case RTTEST_LOCKBKL:
-		if (td->bkl)
-			return 0;
-		td->bkl = 1;
-		lock_kernel();
-		td->bkl = 4;
-		return 0;
-
-	case RTTEST_UNLOCKBKL:
-		if (td->bkl != 4)
-			break;
-		unlock_kernel();
-		td->bkl = 0;
-		return 0;
-
-	default:
-		break;
-	}
-	return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
-	int tid, op, dat;
-	struct test_thread_data *td;
-
-	/* We have to lookup the task */
-	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
-		if (threads[tid] == current)
-			break;
-	}
-
-	BUG_ON(tid == MAX_RT_TEST_THREADS);
-
-	td = &thread_data[tid];
-
-	op = td->opcode;
-	dat = td->opdata;
-
-	switch (op) {
-	case RTTEST_LOCK:
-	case RTTEST_LOCKINT:
-	case RTTEST_LOCKNOWAIT:
-	case RTTEST_LOCKINTNOWAIT:
-		if (mutex != &mutexes[dat])
-			break;
-
-		if (td->mutexes[dat] != 1)
-			break;
-
-		td->mutexes[dat] = 2;
-		td->event = atomic_add_return(1, &rttest_event);
-		break;
-
-	case RTTEST_LOCKBKL:
-	default:
-		break;
-	}
-
-	schedule();
-
-
-	switch (op) {
-	case RTTEST_LOCK:
-	case RTTEST_LOCKINT:
-		if (mutex != &mutexes[dat])
-			return;
-
-		if (td->mutexes[dat] != 2)
-			return;
-
-		td->mutexes[dat] = 3;
-		td->event = atomic_add_return(1, &rttest_event);
-		break;
-
-	case RTTEST_LOCKNOWAIT:
-	case RTTEST_LOCKINTNOWAIT:
-		if (mutex != &mutexes[dat])
-			return;
-
-		if (td->mutexes[dat] != 2)
-			return;
-
-		td->mutexes[dat] = 1;
-		td->event = atomic_add_return(1, &rttest_event);
-		return;
-
-	case RTTEST_LOCKBKL:
-		return;
-	default:
-		return;
-	}
-
-	td->opcode = 0;
-
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (td->opcode > 0) {
-			int ret;
-
-			set_current_state(TASK_RUNNING);
-			ret = handle_op(td, 1);
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (td->opcode == RTTEST_LOCKCONT)
-				break;
-			td->opcode = ret;
-		}
-
-		/* Wait for the next command to be executed */
-		schedule();
-	}
-
-	/* Restore previous command and data */
-	td->opcode = op;
-	td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
-	struct test_thread_data *td = data;
-	int ret;
-
-	current->flags |= PF_MUTEX_TESTER;
-	set_freezable();
-	allow_signal(SIGHUP);
-
-	for(;;) {
-
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (td->opcode > 0) {
-			set_current_state(TASK_RUNNING);
-			ret = handle_op(td, 0);
-			set_current_state(TASK_INTERRUPTIBLE);
-			td->opcode = ret;
-		}
-
-		/* Wait for the next command to be executed */
-		schedule();
-		try_to_freeze();
-
-		if (signal_pending(current))
-			flush_signals(current);
-
-		if(kthread_should_stop())
-			break;
-	}
-	return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev:	thread reference
- * @buf:	command for actual step
- * @count:	length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
-				  size_t count)
-{
-	struct sched_param schedpar;
-	struct test_thread_data *td;
-	char cmdbuf[32];
-	int op, dat, tid, ret;
-
-	td = container_of(dev, struct test_thread_data, sysdev);
-	tid = td->sysdev.id;
-
-	/* strings from sysfs write are not 0 terminated! */
-	if (count >= sizeof(cmdbuf))
-		return -EINVAL;
-
-	/* strip of \n: */
-	if (buf[count-1] == '\n')
-		count--;
-	if (count < 1)
-		return -EINVAL;
-
-	memcpy(cmdbuf, buf, count);
-	cmdbuf[count] = 0;
-
-	if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
-		return -EINVAL;
-
-	switch (op) {
-	case RTTEST_SCHEDOT:
-		schedpar.sched_priority = 0;
-		ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
-		if (ret)
-			return ret;
-		set_user_nice(current, 0);
-		break;
-
-	case RTTEST_SCHEDRT:
-		schedpar.sched_priority = dat;
-		ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
-		if (ret)
-			return ret;
-		break;
-
-	case RTTEST_SIGNAL:
-		send_sig(SIGHUP, threads[tid], 0);
-		break;
-
-	default:
-		if (td->opcode > 0)
-			return -EBUSY;
-		td->opdata = dat;
-		td->opcode = op;
-		wake_up_process(threads[tid]);
-	}
-
-	return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev:	thread to query
- * @buf:	char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
-{
-	struct test_thread_data *td;
-	struct task_struct *tsk;
-	char *curr = buf;
-	int i;
-
-	td = container_of(dev, struct test_thread_data, sysdev);
-	tsk = threads[td->sysdev.id];
-
-	spin_lock(&rttest_lock);
-
-	curr += sprintf(curr,
-		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
-		td->opcode, td->event, tsk->state,
-			(MAX_RT_PRIO - 1) - tsk->prio,
-			(MAX_RT_PRIO - 1) - tsk->normal_prio,
-		tsk->pi_blocked_on, td->bkl);
-
-	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
-		curr += sprintf(curr, "%d", td->mutexes[i]);
-
-	spin_unlock(&rttest_lock);
-
-	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
-			mutexes[td->sysdev.id].owner);
-
-	return curr - buf;
-}
-
-static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
-static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
-
-static struct sysdev_class rttest_sysclass = {
-	.name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
-	thread_data[id].sysdev.cls = &rttest_sysclass;
-	thread_data[id].sysdev.id = id;
-
-	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
-	if (IS_ERR(threads[id]))
-		return PTR_ERR(threads[id]);
-
-	return sysdev_register(&thread_data[id].sysdev);
-}
-
-static int init_rttest(void)
-{
-	int ret, i;
-
-	spin_lock_init(&rttest_lock);
-
-	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
-		rt_mutex_init(&mutexes[i]);
-
-	ret = sysdev_class_register(&rttest_sysclass);
-	if (ret)
-		return ret;
-
-	for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
-		ret = init_test_thread(i);
-		if (ret)
-			break;
-		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
-		if (ret)
-			break;
-		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
-		if (ret)
-			break;
-	}
-
-	printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
-	return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..fc9ba904d5b8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -136,7 +137,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 		return 1;
 	return 0;
 }
@@ -398,43 +399,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -632,6 +596,8 @@ static inline void update_rq_clock(struct rq *rq)
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
 
+#include "sched_trace.h"
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -641,6 +607,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
@@ -865,12 +849,12 @@ static unsigned long long __cpu_clock(int cpu)
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
@@ -879,7 +863,7 @@ unsigned long long cpu_clock(int cpu)
 		time = __sync_cpu_clock(time, cpu);
 		per_cpu(prev_cpu_time, cpu) = time;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return time;
 }
@@ -1368,9 +1352,6 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1393,6 +1374,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
@@ -1505,326 +1492,6 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1845,14 +1512,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1944,7 +1623,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1956,7 +1635,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2138,6 +1817,7 @@ void wait_task_inactive(struct task_struct *p)
 		 * just go back and repeat.
 		 */
 		rq = task_rq_lock(p, &flags);
+		trace_kernel_sched_wait(p);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		task_rq_unlock(rq, &flags);
@@ -2481,6 +2161,7 @@ out_activate:
 	success = 1;
 
 out_running:
+	trace_kernel_sched_wakeup(rq, p);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2609,8 +2290,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
+	trace_kernel_sched_wakeup_new(rq, p);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2783,6 +2465,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+
+	trace_kernel_sched_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3015,6 +2699,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
+	trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -3600,12 +3285,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3721,9 +3403,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3738,13 +3419,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4362,26 +4038,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4393,7 +4087,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -4430,7 +4127,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
 		__schedule_bug(prev);
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4931,8 +4628,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4942,6 +4641,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -5726,7 +5426,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
@@ -5748,12 +5448,7 @@ void sched_show_task(struct task_struct *p)
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-	{
-		unsigned long *n = end_of_stack(p);
-		while (!*n)
-			n++;
-		free = (unsigned long)n - (unsigned long)end_of_stack(p);
-	}
+	free = stack_not_used(p);
 #endif
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -7316,7 +7011,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7327,7 +7021,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7339,7 +7032,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7351,7 +7043,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7364,7 +7055,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7568,8 +7258,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 static cpumask_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;	/* attribues of custom domains
-						   in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attribues of custom domains in 'doms_cur' */
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
@@ -8034,7 +7724,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8599,11 +8288,14 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8613,17 +8305,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
 
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8662,13 +8345,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9c597e37f7de..ce05271219ab 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -59,22 +59,26 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
+static __read_mostly int sched_clock_running;
+
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
-	u64 now = 0;
+	unsigned long now_jiffies = jiffies;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
 		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-		scd->prev_jiffies = jiffies;
-		scd->prev_raw = now;
-		scd->tick_raw = now;
+		scd->prev_jiffies = now_jiffies;
+		scd->prev_raw = 0;
+		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
 	}
+
+	sched_clock_running = 1;
 }
 
 /*
@@ -136,6 +140,9 @@ u64 sched_clock_cpu(int cpu)
 	struct sched_clock_data *scd = cpu_sdc(cpu);
 	u64 now, clock;
 
+	if (unlikely(!sched_clock_running))
+		return 0ull;
+
 	WARN_ON_ONCE(!irqs_disabled());
 	now = sched_clock();
 
@@ -174,6 +181,9 @@ void sched_clock_tick(void)
 	struct sched_clock_data *scd = this_scd();
 	u64 now, now_gtod;
 
+	if (unlikely(!sched_clock_running))
+		return;
+
 	WARN_ON_ONCE(!irqs_disabled());
 
 	now = sched_clock();
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..8bb713040ac9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
 }
 
 static void print_cpu(struct seq_file *m, int cpu)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..08ae848b71d4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				se->load.weight, &cfs_rq_of(se)->load);
-	}
-
-	return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
-
-	return delta;
-}
-
-/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	u64 slice = __sched_period(cfs_rq->nr_running);
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		slice *= se->load.weight;
+		do_div(slice, cfs_rq->load.weight);
+	}
+
+
+	return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
+	unsigned long weight;
+	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
+	vslice = __sched_period(nr_running);
 
 	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
+		cfs_rq = cfs_rq_of(se);
 
-		if (se->load.weight < NICE_0_LOAD)
-			se_lw = &lw;
+		weight = cfs_rq->load.weight;
+		if (!se->on_rq)
+			weight += se->load.weight;
 
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, se_lw);
+		vslice *= NICE_0_LOAD;
+		do_div(vslice, weight);
 	}
 
-	return delta;
+	return vslice;
 }
 
 /*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+	delta_exec_weighted = delta_exec;
+	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+							&curr->load);
+	}
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS)) {
-			unsigned long thresh = sysctl_sched_latency;
-
-			/*
-			 * convert the sleeper threshold into virtual time
-			 */
-			if (sched_feat(NORMALIZED_SLEEPER))
-				thresh = calc_delta_fair(thresh, se);
-
-			vruntime -= thresh;
-		}
+		if (sched_feat(NEW_FAIR_SLEEPERS))
+			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	struct task_struct *curr = this_rq->curr;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE))
+	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
 	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+
+	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && curr->sched_class == &fair_sched_class) {
+	if (sync && balanced && curr->sched_class == &fair_sched_class) {
 		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
 				p->se.avg_overlap < sysctl_sched_migration_cost)
 			return 1;
@@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync)
-		tl -= current->se.load.weight;
-
 	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-			100*(tl + p->se.load.weight) <= imbalance*load) {
+			balanced) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
@@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making it harder for
-	 * + nice tasks.
+	 * More easily preempt - nice tasks, while not making
+	 * it harder for + nice tasks.
 	 */
-	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	if (unlikely(se->load.weight > NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
 	return gran;
 }
@@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
-	struct rq_iterator cfs_rq_iterator;
+	struct sched_entity *curr;
+	struct task_struct *p;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-	cfs_rq_iterator.arg = cfs_rq;
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
+
+	p = task_of(curr);
 
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+	return p->prio;
 }
+#endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
+	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
-
-		/*
-		 * empty group
-		 */
-		if (!aggregate(tg, sd)->task_weight)
-			continue;
-
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+	struct rq_iterator cfs_rq_iterator;
 
-		imbalance = (busiest_weight - this_weight) / 2;
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		max_load = max(rem_load, imbalance);
-		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!moved_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
+		if (rem_load_move <= 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
-{
-	return __load_balance_fair(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
-}
-#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..3432d573205d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5bae2e0c3ff2..a38878e0e49d 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -67,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		preempt_enable();
 #endif
 	}
+	kfree(mask_str);
 	return 0;
 }
 
diff --git a/kernel/sched_trace.h b/kernel/sched_trace.h
new file mode 100644
index 000000000000..29b48f34fd02
--- /dev/null
+++ b/kernel/sched_trace.h
@@ -0,0 +1,41 @@
+#include <linux/marker.h>
+
+static inline void trace_kernel_sched_wait(struct task_struct *p)
+{
+	trace_mark(kernel_sched_wait_task, "pid %d state %ld",
+			p->pid, p->state);
+}
+
+static inline
+void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p)
+{
+	trace_mark(kernel_sched_wakeup,
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline
+void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p)
+{
+	trace_mark(kernel_sched_wakeup_new,
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline void trace_kernel_sched_switch(struct rq *rq,
+		struct task_struct *prev, struct task_struct *next)
+{
+	trace_mark(kernel_sched_schedule,
+			"prev_pid %d next_pid %d prev_state %ld "
+			"## rq %p prev %p next %p",
+			prev->pid, next->pid, prev->state,
+			rq, prev, next);
+}
+
+static inline void
+trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst)
+{
+	trace_mark(kernel_sched_migrate_task,
+			"pid %d state %ld dest_cpu %d",
+			p->pid, p->state, dst);
+}
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..bbab232ee185 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -14,7 +14,7 @@
  * Some notes on the implementation:
  *
  * The spinlock controls access to the other members of the semaphore.
- * down_trylock() and up() can be called from interrupt context, so we
+ * down_try() and up() can be called from interrupt context, so we
  * have to disable interrupts when taking the lock.  It turns out various
  * parts of the kernel expect to be able to use down() on a semaphore in
  * interrupt context when they know it will succeed, so we have to use
@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
+#include <linux/ftrace.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
@@ -53,6 +54,7 @@ void down(struct semaphore *sem)
 {
 	unsigned long flags;
 
+	ftrace_special(sem->count, 0, __LINE__);
 	spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
@@ -114,19 +116,18 @@ int down_killable(struct semaphore *sem)
 EXPORT_SYMBOL(down_killable);
 
 /**
- * down_trylock - try to acquire the semaphore, without waiting
+ * down_try - try to acquire the semaphore, without waiting
  * @sem: the semaphore to be acquired
  *
- * Try to acquire the semaphore atomically.  Returns 0 if the mutex has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * Try to acquire the semaphore atomically.  Returns true if the mutex has
+ * been acquired successfully or 0 if it it cannot be acquired.
  *
- * NOTE: This return value is inverted from both spin_trylock and
- * mutex_trylock!  Be careful about this when converting code.
+ * NOTE: This replaces down_trylock() which returned the reverse.
  *
  * Unlike mutex_trylock, this function can be used from interrupt context,
  * and the semaphore can be released by any task or interrupt.
  */
-int down_trylock(struct semaphore *sem)
+int down_try(struct semaphore *sem)
 {
 	unsigned long flags;
 	int count;
@@ -137,9 +138,9 @@ int down_trylock(struct semaphore *sem)
 		sem->count = count;
 	spin_unlock_irqrestore(&sem->lock, flags);
 
-	return (count < 0);
+	return (count >= 0);
 }
-EXPORT_SYMBOL(down_trylock);
+EXPORT_SYMBOL(down_try);
 
 /**
  * down_timeout - acquire the semaphore within a specified time
diff --git a/kernel/signal.c b/kernel/signal.c
index 72bb4f51f963..6c0958e52ea7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+	sigset_t signal, retain;
+	struct sigqueue *q, *n;
+
+	signal = pending->signal;
+	sigemptyset(&retain);
+
+	list_for_each_entry_safe(q, n, &pending->list, list) {
+		int sig = q->info.si_signo;
+
+		if (likely(q->info.si_code != SI_TIMER)) {
+			sigaddset(&retain, sig);
+		} else {
+			sigdelset(&signal, sig);
+			list_del_init(&q->list);
+			__sigqueue_free(q);
+		}
+	}
+
+	sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+	struct task_struct *tsk = current;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	__flush_itimer_signals(&tsk->pending);
+	__flush_itimer_signals(&tsk->signal->shared_pending);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+}
+
 void ignore_signals(struct task_struct *t)
 {
 	int i;
@@ -1240,17 +1274,22 @@ void sigqueue_free(struct sigqueue *q)
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 	/*
-	 * If the signal is still pending remove it from the
-	 * pending queue. We must hold ->siglock while testing
-	 * q->list to serialize with collect_signal().
+	 * We must hold ->siglock while testing q->list
+	 * to serialize with collect_signal() or with
+	 * __exit_signal()->flush_sigqueue().
 	 */
 	spin_lock_irqsave(lock, flags);
+	q->flags &= ~SIGQUEUE_PREALLOC;
+	/*
+	 * If it is queued it will be freed when dequeued,
+	 * like the "regular" sigqueue.
+	 */
 	if (!list_empty(&q->list))
-		list_del_init(&q->list);
+		q = NULL;
 	spin_unlock_irqrestore(lock, flags);
 
-	q->flags &= ~SIGQUEUE_PREALLOC;
-	__sigqueue_free(q);
+	if (q)
+		__sigqueue_free(q);
 }
 
 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..a1fb54c93cdd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
 
-int in_lock_functions(unsigned long addr)
+notrace int in_lock_functions(unsigned long addr)
 {
 	/* Linker adds these: start and end of __lockfunc functions */
 	extern char __lock_text_start[], __lock_text_end[];
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0101aeef7ed7..36e166def7c7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -13,202 +13,171 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
-/* Since we effect priority and affinity (both of which are visible
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-
-/* Thread to stop each CPU in user context. */
+/* This controls the threads on each CPU. */
 enum stopmachine_state {
-	STOPMACHINE_WAIT,
-	STOPMACHINE_PREPARE,
+	/* Dummy starting state for thread. */
+	STOPMACHINE_NONE,
+	/* Disable interrupts. */
 	STOPMACHINE_DISABLE_IRQ,
+	/* Run the function */
+	STOPMACHINE_RUN,
+	/* Exit */
 	STOPMACHINE_EXIT,
+	/* Everyone exited. */
+	STOPMACHINE_COMPLETE,
 };
+static enum stopmachine_state state;
 
-static enum stopmachine_state stopmachine_state;
-static unsigned int stopmachine_num_threads;
-static atomic_t stopmachine_thread_ack;
+struct stop_machine_data {
+	int (*fn)(void *);
+	void *data;
+	int fnret;
+};
+
+/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static struct completion finished;
 
-static int stopmachine(void *cpu)
+static void set_state(enum stopmachine_state newstate)
 {
-	int irqs_disabled = 0;
-	int prepared = 0;
+	/* Reset ack counter. */
+	atomic_set(&thread_ack, num_threads);
+	smp_wmb();
+	state = newstate;
+}
 
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
+{
+	if (atomic_dec_and_test(&thread_ack)) {
+		set_state(state + 1);
+		if (state == STOPMACHINE_COMPLETE)
+			complete(&finished);
+	}
+}
 
-	/* Ack: we are alive */
-	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
-	atomic_inc(&stopmachine_thread_ack);
+/* This is the actual thread which stops the CPU.  It exits by itself rather
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
+static int stop_cpu(struct stop_machine_data *smdata)
+{
+	enum stopmachine_state curstate = STOPMACHINE_NONE;
+	int uninitialized_var(ret);
 
 	/* Simple state machine */
-	while (stopmachine_state != STOPMACHINE_EXIT) {
-		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
-		    && !irqs_disabled) {
-			local_irq_disable();
-			hard_irq_disable();
-			irqs_disabled = 1;
-			/* Ack: irqs disabled. */
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_PREPARE
-			   && !prepared) {
-			/* Everyone is in place, hold CPU. */
-			preempt_disable();
-			prepared = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
+	do {
+		/* Chill out and ensure we re-read stopmachine_state. */
+		cpu_relax();
+		if (state != curstate) {
+			curstate = state;
+			switch (curstate) {
+			case STOPMACHINE_DISABLE_IRQ:
+				local_irq_disable();
+				hard_irq_disable();
+				break;
+			case STOPMACHINE_RUN:
+				/* |= allows error detection if functions on
+				 * multiple CPUs. */
+				smdata->fnret |= smdata->fn(smdata->data);
+				break;
+			default:
+				break;
+			}
+			ack_state();
 		}
-		/* Yield in first stage: migration threads need to
-		 * help our sisters onto their CPUs. */
-		if (!prepared && !irqs_disabled)
-			yield();
-		else
-			cpu_relax();
-	}
-
-	/* Ack: we are exiting. */
-	smp_mb(); /* Must read state first. */
-	atomic_inc(&stopmachine_thread_ack);
+	} while (curstate < STOPMACHINE_EXIT);
 
-	if (irqs_disabled)
-		local_irq_enable();
-	if (prepared)
-		preempt_enable();
-
-	return 0;
+	local_irq_enable();
+	do_exit(0);
 }
 
-/* Change the thread state */
-static void stopmachine_set_state(enum stopmachine_state state)
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
 {
-	atomic_set(&stopmachine_thread_ack, 0);
-	smp_wmb();
-	stopmachine_state = state;
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		cpu_relax();
+	return 0;
 }
 
-static int stop_machine(void)
+int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	int i, ret = 0;
-
-	atomic_set(&stopmachine_thread_ack, 0);
-	stopmachine_num_threads = 0;
-	stopmachine_state = STOPMACHINE_WAIT;
-
-	for_each_online_cpu(i) {
-		if (i == raw_smp_processor_id())
-			continue;
-		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
-		if (ret < 0)
-			break;
-		stopmachine_num_threads++;
-	}
+	int i, err;
+	struct stop_machine_data active, idle;
+	struct task_struct **threads;
 
-	/* Wait for them all to come to life. */
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		yield();
+	active.fn = fn;
+	active.data = data;
+	active.fnret = 0;
+	idle.fn = chill;
+	idle.data = NULL;
 
-	/* If some failed, kill them all. */
-	if (ret < 0) {
-		stopmachine_set_state(STOPMACHINE_EXIT);
-		return ret;
-	}
-
-	/* Now they are all started, make them hold the CPUs, ready. */
-	preempt_disable();
-	stopmachine_set_state(STOPMACHINE_PREPARE);
+	/* If they don't care which cpu fn runs on, just pick one. */
+	if (cpu == NR_CPUS)
+		cpu = any_online_cpu(cpu_online_map);
 
-	/* Make them disable irqs. */
-	local_irq_disable();
-	hard_irq_disable();
-	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
+	/* This could be too big for stack on large machines. */
+	threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
 
-	return 0;
-}
+	/* Set up initial state. */
+	init_completion(&finished);
+	num_threads = num_online_cpus();
+	set_state(STOPMACHINE_DISABLE_IRQ);
 
-static void restart_machine(void)
-{
-	stopmachine_set_state(STOPMACHINE_EXIT);
-	local_irq_enable();
-	preempt_enable_no_resched();
-}
-
-struct stop_machine_data {
-	int (*fn)(void *);
-	void *data;
-	struct completion done;
-};
+	for_each_online_cpu(i) {
+		struct stop_machine_data *smdata;
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
-static int do_stop(void *_smdata)
-{
-	struct stop_machine_data *smdata = _smdata;
-	int ret;
+		if (cpu == ALL_CPUS || i == cpu)
+			smdata = &active;
+		else
+			smdata = &idle;
 
-	ret = stop_machine();
-	if (ret == 0) {
-		ret = smdata->fn(smdata->data);
-		restart_machine();
-	}
+		threads[i] = kthread_create(stop_cpu, smdata, "kstop%u", i);
+		if (IS_ERR(threads[i])) {
+			err = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto kill_threads;
+		}
 
-	/* We're done: you can kthread_stop us now */
-	complete(&smdata->done);
+		/* Place it onto correct cpu. */
+		kthread_bind(threads[i], i);
 
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
+		/* Make it highest prio. */
+		if (sched_setscheduler(threads[i], SCHED_FIFO, &param) != 0)
+			BUG();
 	}
-	__set_current_state(TASK_RUNNING);
-	return ret;
-}
-
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
-				       unsigned int cpu)
-{
-	static DEFINE_MUTEX(stopmachine_mutex);
-	struct stop_machine_data smdata;
-	struct task_struct *p;
 
-	smdata.fn = fn;
-	smdata.data = data;
-	init_completion(&smdata.done);
+	/* We've created all the threads.  Wake them all: hold this CPU so one
+	 * doesn't hit this CPU until we're ready. */
+	cpu = get_cpu();
+	for_each_online_cpu(i)
+		wake_up_process(threads[i]);
 
-	mutex_lock(&stopmachine_mutex);
+	/* This will release the thread on our CPU. */
+	put_cpu();
+	wait_for_completion(&finished);
 
-	/* If they don't care which CPU fn runs on, bind to any online one. */
-	if (cpu == NR_CPUS)
-		cpu = raw_smp_processor_id();
+	kfree(threads);
 
-	p = kthread_create(do_stop, &smdata, "kstopmachine");
-	if (!IS_ERR(p)) {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	return active.fnret;
 
-		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler(p, SCHED_FIFO, &param);
-		kthread_bind(p, cpu);
-		wake_up_process(p);
-		wait_for_completion(&smdata.done);
-	}
-	mutex_unlock(&stopmachine_mutex);
-	return p;
+kill_threads:
+	for_each_online_cpu(i)
+		if (threads[i])
+			kthread_stop(threads[i]);
+	kfree(threads);
+	return err;
 }
 
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int stop_machine_run_notype(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	struct task_struct *p;
 	int ret;
 
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
-	p = __stop_machine_run(fn, data, cpu);
-	if (!IS_ERR(p))
-		ret = kthread_stop(p);
-	else
-		ret = PTR_ERR(p);
+	ret = __stop_machine_run(fn, data, cpu);
 	put_online_cpus();
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(stop_machine_run);
+EXPORT_SYMBOL_GPL(stop_machine_run_notype);
diff --git a/kernel/sys.c b/kernel/sys.c
index 895d2d4c9493..14e97282eb6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask)
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
-	long uninitialized_var(error);
+	long error = 0;
 
 	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
 		return error;
@@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = PR_TIMING_STATISTICAL;
 			break;
 		case PR_SET_TIMING:
-			if (arg2 == PR_TIMING_STATISTICAL)
-				error = 0;
-			else
+			if (arg2 != PR_TIMING_STATISTICAL)
 				error = -EINVAL;
 			break;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d7ffdc59816a..da2279d012d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -81,6 +82,7 @@ extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int latencytop_enabled;
+extern int sysctl_nr_open_min, sysctl_nr_open_max;
 
 /* Constants used for minimum and  maximum */
 #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -131,8 +133,6 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
 
-extern int sysctl_hz_timer;
-
 #ifdef CONFIG_BSD_PROCESS_ACCT
 extern int acct_parm[];
 #endif
@@ -454,6 +454,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_FTRACE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_enabled",
+		.data		= &ftrace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ftrace_enable_sysctl,
+	},
+#endif
 #ifdef CONFIG_KMOD
 	{
 		.ctl_name	= KERN_MODPROBE,
@@ -562,16 +572,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_NO_IDLE_HZ
-	{
-		.ctl_name       = KERN_HZ_TIMER,
-		.procname       = "hz_timer",
-		.data           = &sysctl_hz_timer,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-#endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
@@ -1190,7 +1190,9 @@ static struct ctl_table fs_table[] = {
 		.data		= &sysctl_nr_open,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &sysctl_nr_open_min,
+		.extra2		= &sysctl_nr_open_max,
 	},
 	{
 		.ctl_name	= FS_DENTRY,
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
deleted file mode 100644
index 06b6395b45b2..000000000000
--- a/kernel/test_kprobes.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * test_kprobes.c - simple sanity test for *probes
- *
- * Copyright IBM Corp. 2008
- *
- * This program is free software;  you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/random.h>
-
-#define div_factor 3
-
-static u32 rand1, preh_val, posth_val, jph_val;
-static int errors, handler_errors, num_tests;
-
-static noinline u32 kprobe_target(u32 value)
-{
-	/*
-	 * gcc ignores noinline on some architectures unless we stuff
-	 * sufficient lard into the function. The get_kprobe() here is
-	 * just for that.
-	 *
-	 * NOTE: We aren't concerned about the correctness of get_kprobe()
-	 * here; hence, this call is neither under !preempt nor with the
-	 * kprobe_mutex held. This is fine(tm)
-	 */
-	if (get_kprobe((void *)0xdeadbeef))
-		printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
-
-	return (value / div_factor);
-}
-
-static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-	preh_val = (rand1 / div_factor);
-	return 0;
-}
-
-static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
-		unsigned long flags)
-{
-	if (preh_val != (rand1 / div_factor)) {
-		handler_errors++;
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"incorrect value in post_handler\n");
-	}
-	posth_val = preh_val + div_factor;
-}
-
-static struct kprobe kp = {
-	.symbol_name = "kprobe_target",
-	.pre_handler = kp_pre_handler,
-	.post_handler = kp_post_handler
-};
-
-static int test_kprobe(void)
-{
-	int ret;
-
-	ret = register_kprobe(&kp);
-	if (ret < 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"register_kprobe returned %d\n", ret);
-		return ret;
-	}
-
-	ret = kprobe_target(rand1);
-	unregister_kprobe(&kp);
-
-	if (preh_val == 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"kprobe pre_handler not called\n");
-		handler_errors++;
-	}
-
-	if (posth_val == 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"kprobe post_handler not called\n");
-		handler_errors++;
-	}
-
-	return 0;
-}
-
-static u32 j_kprobe_target(u32 value)
-{
-	if (value != rand1) {
-		handler_errors++;
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"incorrect value in jprobe handler\n");
-	}
-
-	jph_val = rand1;
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe jp = {
-	.entry		= j_kprobe_target,
-	.kp.symbol_name = "kprobe_target"
-};
-
-static int test_jprobe(void)
-{
-	int ret;
-
-	ret = register_jprobe(&jp);
-	if (ret < 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"register_jprobe returned %d\n", ret);
-		return ret;
-	}
-
-	ret = kprobe_target(rand1);
-	unregister_jprobe(&jp);
-	if (jph_val == 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"jprobe handler not called\n");
-		handler_errors++;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_KRETPROBES
-static u32 krph_val;
-
-static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
-	krph_val = (rand1 / div_factor);
-	return 0;
-}
-
-static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
-	unsigned long ret = regs_return_value(regs);
-
-	if (ret != (rand1 / div_factor)) {
-		handler_errors++;
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"incorrect value in kretprobe handler\n");
-	}
-	if (krph_val == 0) {
-		handler_errors++;
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"call to kretprobe entry handler failed\n");
-	}
-
-	krph_val = rand1;
-	return 0;
-}
-
-static struct kretprobe rp = {
-	.handler	= return_handler,
-	.entry_handler  = entry_handler,
-	.kp.symbol_name = "kprobe_target"
-};
-
-static int test_kretprobe(void)
-{
-	int ret;
-
-	ret = register_kretprobe(&rp);
-	if (ret < 0) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"register_kretprobe returned %d\n", ret);
-		return ret;
-	}
-
-	ret = kprobe_target(rand1);
-	unregister_kretprobe(&rp);
-	if (krph_val != rand1) {
-		printk(KERN_ERR "Kprobe smoke test failed: "
-				"kretprobe handler not called\n");
-		handler_errors++;
-	}
-
-	return 0;
-}
-#endif /* CONFIG_KRETPROBES */
-
-int init_test_probes(void)
-{
-	int ret;
-
-	do {
-		rand1 = random32();
-	} while (rand1 <= div_factor);
-
-	printk(KERN_INFO "Kprobe smoke test started\n");
-	num_tests++;
-	ret = test_kprobe();
-	if (ret < 0)
-		errors++;
-
-	num_tests++;
-	ret = test_jprobe();
-	if (ret < 0)
-		errors++;
-
-#ifdef CONFIG_KRETPROBES
-	num_tests++;
-	ret = test_kretprobe();
-	if (ret < 0)
-		errors++;
-#endif /* CONFIG_KRETPROBES */
-
-	if (errors)
-		printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
-				"%d tests failed\n", errors, num_tests);
-	else if (handler_errors)
-		printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
-				"running handlers\n", handler_errors);
-	else
-		printk(KERN_INFO "Kprobe smoke test passed successfully\n");
-
-	return 0;
-}
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..ef3fa6906e8f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base)
 	spin_unlock_irq(&base->lock);
 }
 
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /*
  * Find out when the next timer event is due to happen. This
  * is used on S/390 to stop all activity when a cpus is idle.
@@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 
 	return cmp_next_hrtimer_event(now, expires);
 }
-
-#ifdef CONFIG_NO_IDLE_HZ
-unsigned long next_timer_interrupt(void)
-{
-	return get_next_timer_interrupt(jiffies);
-}
-#endif
-
 #endif
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..263e9e6bbd60
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,135 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
+
+config HAVE_DYNAMIC_FTRACE
+	bool
+
+config TRACER_MAX_TRACE
+	bool
+
+config TRACING
+	bool
+	select DEBUG_FS
+	select STACKTRACE
+
+config FTRACE
+	bool "Kernel Function Tracer"
+	depends on HAVE_FTRACE
+	select FRAME_POINTER
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	help
+	  Enable the kernel to trace every kernel function. This is done
+	  by using a compiler feature to insert a small, 5-byte No-Operation
+	  instruction to the beginning of every kernel function, which NOP
+	  sequence is then dynamically patched into a tracer call when
+	  tracing is enabled by the administrator. If it's runtime disabled
+	  (the bootup default), then the overhead of the instructions is very
+	  small and not measurable even in micro-benchmarks.
+
+config IRQSOFF_TRACER
+	bool "Interrupts-off Latency Tracer"
+	default n
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on GENERIC_TIME
+	depends on HAVE_FTRACE
+	select TRACE_IRQFLAGS
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	depends on HAVE_FTRACE
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config SYSPROF_TRACER
+	bool "Sysprof Tracer"
+	depends on X86
+	select TRACING
+	help
+	  This tracer provides the trace needed by the 'Sysprof' userspace
+	  tool.
+
+config SCHED_TRACER
+	bool "Scheduling Latency Tracer"
+	depends on HAVE_FTRACE
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	select TRACER_MAX_TRACE
+	help
+	  This tracer tracks the latency of the highest priority task
+	  to be scheduled in, starting from the point it has woken up.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on HAVE_FTRACE
+	select TRACING
+	select MARKERS
+	help
+	  This tracer gets called from the context switch and records
+	  all switching of tasks.
+
+config DYNAMIC_FTRACE
+	bool "enable/disable ftrace tracepoints dynamically"
+	depends on FTRACE
+	depends on HAVE_DYNAMIC_FTRACE
+	default y
+	help
+         This option will modify all the calls to ftrace dynamically
+	 (will patch them out of the binary image and replaces them
+	 with a No-Op instruction) as they are called. A table is
+	 created to dynamically enable them again.
+
+	 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	 has native performance as long as no tracing is active.
+
+	 The changes to the code are done by a kernel thread that
+	 wakes up once a second and checks to see if any ftrace calls
+	 were made. If so, it runs stop_machine (stops all CPUS)
+	 and modifies the code to jump over the call to ftrace.
+
+config FTRACE_SELFTEST
+	bool
+
+config FTRACE_STARTUP_TEST
+	bool "Perform a startup test on ftrace"
+	depends on TRACING
+	select FTRACE_SELFTEST
+	help
+	  This option performs a series of startup tests on ftrace. On bootup
+	  a series of tests are made to verify that the tracer is
+	  functioning properly. It will do tests on all the configured
+	  tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..71d17de17288
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,24 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FTRACE
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..f762f5a2d331
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1553 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include "trace.h"
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
+
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+	/* Should never be called by interrupts */
+	spin_lock(&ftrace_lock);
+
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+
+	if (ftrace_enabled) {
+		/*
+		 * For one func, simply call it directly.
+		 * For more than one func, call the chain.
+		 */
+		if (ops->next == &ftrace_list_end)
+			ftrace_trace_function = ops->func;
+		else
+			ftrace_trace_function = ftrace_list_func;
+	}
+
+	spin_unlock(&ftrace_lock);
+
+	return 0;
+}
+
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock(&ftrace_lock);
+
+	/*
+	 * If we are removing the last function, then simply point
+	 * to the ftrace_stub.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	if (ftrace_enabled) {
+		/* If we only have one func left, then call that directly */
+		if (ftrace_list == &ftrace_list_end ||
+		    ftrace_list->next == &ftrace_list_end)
+			ftrace_trace_function = ftrace_list->func;
+	}
+
+ out:
+	spin_unlock(&ftrace_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct task_struct *ftraced_task;
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+};
+
+static int ftrace_filtered;
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	unsigned long		index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+static int ftraced_stop;
+
+static int ftrace_record_suspend;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+static inline int
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+	struct dyn_ftrace *p;
+	struct hlist_node *t;
+	int found = 0;
+
+	hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
+		if (p->ip == ip) {
+			found = 1;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static inline void
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+	/* no locking, only called from kstop_machine */
+
+	rec->ip = (unsigned long)ftrace_free_records;
+	ftrace_free_records = rec;
+	rec->flags |= FTRACE_FL_FREE;
+}
+
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+
+	/* First check for freed records */
+	if (ftrace_free_records) {
+		rec = ftrace_free_records;
+
+		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+			WARN_ON_ONCE(1);
+			ftrace_free_records = NULL;
+			ftrace_disabled = 1;
+			ftrace_enabled = 0;
+			return NULL;
+		}
+
+		ftrace_free_records = (void *)rec->ip;
+		memset(rec, 0, sizeof(*rec));
+		return rec;
+	}
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	return &ftrace_pages->records[ftrace_pages->index++];
+}
+
+static void
+ftrace_record_ip(unsigned long ip)
+{
+	struct dyn_ftrace *node;
+	unsigned long flags;
+	unsigned long key;
+	int resched;
+	int atomic;
+	int cpu;
+
+	if (!ftrace_enabled || ftrace_disabled)
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	/*
+	 * We simply need to protect against recursion.
+	 * Use the the raw version of smp_processor_id and not
+	 * __get_cpu_var which can call debug hooks that can
+	 * cause a recursive crash here.
+	 */
+	cpu = raw_smp_processor_id();
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+	if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
+		goto out;
+
+	if (unlikely(ftrace_record_suspend))
+		goto out;
+
+	key = hash_long(ip, FTRACE_HASHBITS);
+
+	WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+	if (ftrace_ip_in_hash(ip, key))
+		goto out;
+
+	atomic = irqs_disabled();
+
+	spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+	/* This ip may have hit the hash before the lock */
+	if (ftrace_ip_in_hash(ip, key))
+		goto out_unlock;
+
+	/*
+	 * There's a slight race that the ftraced will update the
+	 * hash and reset here. If it is already converted, skip it.
+	 */
+	if (ftrace_ip_converted(ip))
+		goto out_unlock;
+
+	node = ftrace_alloc_dyn_node(ip);
+	if (!node)
+		goto out_unlock;
+
+	node->ip = ip;
+
+	ftrace_add_hash(node, key);
+
+	ftraced_trigger = 1;
+
+ out_unlock:
+	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+#define FTRACE_ADDR ((long)(ftrace_caller))
+#define MCOUNT_ADDR ((long)(mcount))
+
+static void
+__ftrace_replace_code(struct dyn_ftrace *rec,
+		      unsigned char *old, unsigned char *new, int enable)
+{
+	unsigned long ip, fl;
+	int failed;
+
+	ip = rec->ip;
+
+	if (ftrace_filtered && enable) {
+		/*
+		 * If filtering is on:
+		 *
+		 * If this record is set to be filtered and
+		 * is enabled then do nothing.
+		 *
+		 * If this record is set to be filtered and
+		 * it is not enabled, enable it.
+		 *
+		 * If this record is not set to be filtered
+		 * and it is not enabled do nothing.
+		 *
+		 * If this record is set not to trace then
+		 * do nothing.
+		 *
+		 * If this record is not set to be filtered and
+		 * it is enabled, disable it.
+		 */
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
+
+		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+		    (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
+			return;
+
+		/*
+		 * If it is enabled disable it,
+		 * otherwise enable it!
+		 */
+		if (fl == FTRACE_FL_ENABLED) {
+			/* swap new and old */
+			new = old;
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		} else {
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags |= FTRACE_FL_ENABLED;
+		}
+	} else {
+
+		if (enable) {
+			/*
+			 * If this record is set not to trace and is
+			 * not enabled, do nothing.
+			 */
+			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+			if (fl == FTRACE_FL_NOTRACE)
+				return;
+
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+		} else
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+
+		if (enable) {
+			if (rec->flags & FTRACE_FL_ENABLED)
+				return;
+			rec->flags |= FTRACE_FL_ENABLED;
+		} else {
+			if (!(rec->flags & FTRACE_FL_ENABLED))
+				return;
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		}
+	}
+
+	failed = ftrace_modify_code(ip, old, new);
+	if (failed) {
+		unsigned long key;
+		/* It is possible that the function hasn't been converted yet */
+		key = hash_long(ip, FTRACE_HASHBITS);
+		if (!ftrace_ip_in_hash(ip, key)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			ftrace_free_rec(rec);
+		}
+
+	}
+}
+
+static void ftrace_replace_code(int enable)
+{
+	unsigned char *new = NULL, *old = NULL;
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	int i;
+
+	if (enable)
+		old = ftrace_nop_replace();
+	else
+		new = ftrace_nop_replace();
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+
+			__ftrace_replace_code(rec, old, new, enable);
+		}
+	}
+}
+
+static void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+	unsigned long ip;
+	unsigned char *nop, *call;
+	int failed;
+
+	ip = rec->ip;
+
+	nop = ftrace_nop_replace();
+	call = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+	failed = ftrace_modify_code(ip, call, nop);
+	if (failed) {
+		rec->flags |= FTRACE_FL_FAILED;
+		ftrace_free_rec(rec);
+		return 0;
+	}
+	return 1;
+}
+
+static int __ftrace_update_code(void *ignore);
+
+static int __ftrace_modify_code(void *data)
+{
+	unsigned long addr;
+	int *command = data;
+
+	if (*command & FTRACE_ENABLE_CALLS) {
+		/*
+		 * Update any recorded ips now that we have the
+		 * machine stopped
+		 */
+		__ftrace_update_code(NULL);
+		ftrace_replace_code(1);
+	} else if (*command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (*command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (*command & FTRACE_ENABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_record_ip;
+		ftrace_mcount_set(&addr);
+	} else if (*command & FTRACE_DISABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_stub;
+		ftrace_mcount_set(&addr);
+	}
+
+	return 0;
+}
+
+static void ftrace_run_update_code(int command)
+{
+	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+}
+
+void ftrace_disable_daemon(void)
+{
+	/* Stop the daemon from calling kstop_machine */
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 1;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+void ftrace_enable_daemon(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 0;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+static ftrace_func_t saved_ftrace_func;
+
+static void ftrace_startup(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend++;
+	if (ftraced_suspend == 1)
+		command |= FTRACE_ENABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend--;
+	if (!ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_startup_sysctl(void)
+{
+	int command = FTRACE_ENABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* Force update next time */
+	saved_ftrace_func = NULL;
+	/* ftraced_suspend is true if we want ftrace running */
+	if (ftraced_suspend)
+		command |= FTRACE_ENABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+	int command = FTRACE_DISABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* ftraced_suspend is true if ftrace is running */
+	if (ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t		ftrace_update_time;
+static unsigned long	ftrace_update_cnt;
+unsigned long		ftrace_update_tot_cnt;
+
+static int __ftrace_update_code(void *ignore)
+{
+	struct dyn_ftrace *p;
+	struct hlist_head head;
+	struct hlist_node *t;
+	int save_ftrace_enabled;
+	cycle_t start, stop;
+	int i;
+
+	/* Don't be recording funcs now */
+	ftrace_record_suspend++;
+	save_ftrace_enabled = ftrace_enabled;
+	ftrace_enabled = 0;
+
+	start = ftrace_now(raw_smp_processor_id());
+	ftrace_update_cnt = 0;
+
+	/* No locks needed, the machine is stopped! */
+	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		if (hlist_empty(&ftrace_hash[i]))
+			continue;
+
+		head = ftrace_hash[i];
+		INIT_HLIST_HEAD(&ftrace_hash[i]);
+
+		/* all CPUS are stopped, we are safe to modify code */
+		hlist_for_each_entry(p, t, &head, node) {
+			if (ftrace_code_disable(p))
+				ftrace_update_cnt++;
+		}
+
+	}
+
+	stop = ftrace_now(raw_smp_processor_id());
+	ftrace_update_time = stop - start;
+	ftrace_update_tot_cnt += ftrace_update_cnt;
+	ftraced_trigger = 0;
+
+	ftrace_enabled = save_ftrace_enabled;
+	ftrace_record_suspend--;
+
+	return 0;
+}
+
+static int ftrace_update_code(void)
+{
+	if (unlikely(ftrace_disabled) ||
+	    !ftrace_enabled || !ftraced_trigger)
+		return 0;
+
+	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+	return 1;
+}
+
+static int ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	while (!kthread_should_stop()) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		if (unlikely(ftrace_disabled))
+			continue;
+
+		mutex_lock(&ftrace_sysctl_lock);
+		mutex_lock(&ftraced_lock);
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					" (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				ftrace_disabled = 1;
+				WARN_ON_ONCE(1);
+			}
+		}
+		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
+
+		ftrace_shutdown_replenish();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __init ftrace_dyn_table_alloc(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
+
+enum {
+	FTRACE_ITER_FILTER	= (1 << 0),
+	FTRACE_ITER_CONT	= (1 << 1),
+	FTRACE_ITER_NOTRACE	= (1 << 2),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+	loff_t			pos;
+	struct ftrace_page	*pg;
+	unsigned		idx;
+	unsigned		flags;
+	unsigned char		buffer[FTRACE_BUFF_MAX+1];
+	unsigned		buffer_idx;
+	unsigned		filtered;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct dyn_ftrace *rec = NULL;
+
+	(*pos)++;
+
+ retry:
+	if (iter->idx >= iter->pg->index) {
+		if (iter->pg->next) {
+			iter->pg = iter->pg->next;
+			iter->idx = 0;
+			goto retry;
+		}
+	} else {
+		rec = &iter->pg->records[iter->idx++];
+		if ((rec->flags & FTRACE_FL_FAILED) ||
+		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		     !(rec->flags & FTRACE_FL_FILTER)) ||
+		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
+		     !(rec->flags & FTRACE_FL_NOTRACE))) {
+			rec = NULL;
+			goto retry;
+		}
+	}
+
+	iter->pos = *pos;
+
+	return rec;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = -1;
+
+	if (*pos != iter->pos) {
+		for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+			;
+	} else {
+		l = *pos;
+		p = t_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!rec)
+		return 0;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations show_ftrace_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.stop = t_stop,
+	.show = t_show,
+};
+
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->pos = -1;
+
+	ret = seq_open(file, &show_ftrace_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = iter;
+	} else {
+		kfree(iter);
+	}
+
+	return ret;
+}
+
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter = m->private;
+
+	seq_release(inode, file);
+	kfree(iter);
+
+	return 0;
+}
+
+static void ftrace_filter_reset(int enable)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i;
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 0;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			rec->flags &= ~type;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+	struct ftrace_iterator *iter;
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&ftrace_regex_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_filter_reset(enable);
+
+	if (file->f_mode & FMODE_READ) {
+		iter->pg = ftrace_pages_start;
+		iter->pos = -1;
+		iter->flags = enable ? FTRACE_ITER_FILTER :
+			FTRACE_ITER_NOTRACE;
+
+		ret = seq_open(file, &show_ftrace_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = iter;
+		} else
+			kfree(iter);
+	} else
+		file->private_data = iter;
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 0);
+}
+
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, origin);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
+enum {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *search = NULL;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i, match = 0, search_len = 0;
+
+	for (i = 0; i < len; i++) {
+		if (buff[i] == '*') {
+			if (!i) {
+				search = buff + i + 1;
+				type = MATCH_END_ONLY;
+				search_len = len - (i + 1);
+			} else {
+				if (type == MATCH_END_ONLY) {
+					type = MATCH_MIDDLE_ONLY;
+				} else {
+					match = i;
+					type = MATCH_FRONT_ONLY;
+				}
+				buff[i] = 0;
+				break;
+			}
+		}
+	}
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 1;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			int matched = 0;
+			char *ptr;
+
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			switch (type) {
+			case MATCH_FULL:
+				if (strcmp(str, buff) == 0)
+					matched = 1;
+				break;
+			case MATCH_FRONT_ONLY:
+				if (memcmp(str, buff, match) == 0)
+					matched = 1;
+				break;
+			case MATCH_MIDDLE_ONLY:
+				if (strstr(str, search))
+					matched = 1;
+				break;
+			case MATCH_END_ONLY:
+				ptr = strstr(str, search);
+				if (ptr && (ptr[search_len] == 0))
+					matched = 1;
+				break;
+			}
+			if (matched)
+				rec->flags |= flag;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos, int enable)
+{
+	struct ftrace_iterator *iter;
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&ftrace_regex_lock);
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		iter = m->private;
+	} else
+		iter = file->private_data;
+
+	if (!*ppos) {
+		iter->flags &= ~FTRACE_ITER_CONT;
+		iter->buffer_idx = 0;
+	}
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+		if (isspace(ch)) {
+			file->f_pos += read;
+			ret = read;
+			goto out;
+		}
+
+		iter->buffer_idx = 0;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (iter->buffer_idx < FTRACE_BUFF_MAX)
+			iter->buffer[iter->buffer_idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		iter->buffer_idx = 0;
+	} else
+		iter->flags |= FTRACE_ITER_CONT;
+
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (reset)
+		ftrace_filter_reset(enable);
+	if (buf)
+		ftrace_match(buf, len, enable);
+	mutex_unlock(&ftrace_regex_lock);
+}
+
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 1);
+}
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 0);
+}
+
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (file->f_mode & FMODE_READ) {
+		iter = m->private;
+
+		seq_release(inode, file);
+	} else
+		iter = file->private_data;
+
+	if (iter->buffer_idx) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+	}
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+	if (iter->filtered && ftraced_suspend && ftrace_enabled)
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	kfree(iter);
+	mutex_unlock(&ftrace_regex_lock);
+	return 0;
+}
+
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 0);
+}
+
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	/* don't worry about races */
+	char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
+	int r = strlen(buf);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (strncmp(buf, "enable", 6) == 0)
+		val = 1;
+	else if (strncmp(buf, "disable", 7) == 0)
+		val = 0;
+	else {
+		buf[cnt] = 0;
+
+		ret = strict_strtoul(buf, 10, &val);
+		if (ret < 0)
+			return ret;
+
+		val = !!val;
+	}
+
+	if (val)
+		ftrace_enable_daemon();
+	else
+		ftrace_disable_daemon();
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations ftrace_avail_fops = {
+	.open = ftrace_avail_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_filter_fops = {
+	.open = ftrace_filter_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_filter_release,
+};
+
+static struct file_operations ftrace_notrace_fops = {
+	.open = ftrace_notrace_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_notrace_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_notrace_release,
+};
+
+static struct file_operations ftraced_fops = {
+	.open = tracing_open_generic,
+	.read = ftraced_read,
+	.write = ftraced_write,
+};
+
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ */
+int ftrace_force_update(void)
+{
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+
+	/*
+	 * If ftraced_trigger is not set, then there is nothing
+	 * to update.
+	 */
+	if (ftraced_trigger && !ftrace_update_code())
+		ret = -EBUSY;
+
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+static void ftrace_force_shutdown(void)
+{
+	struct task_struct *task;
+	int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
+
+	mutex_lock(&ftraced_lock);
+	task = ftraced_task;
+	ftraced_task = NULL;
+	ftraced_suspend = -1;
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+
+	if (task)
+		kthread_stop(task);
+}
+
+static __init int ftrace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("available_filter_functions", 0444,
+				    d_tracer, NULL, &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_filter_functions' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+				    NULL, &ftrace_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_filter' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+				    NULL, &ftrace_notrace_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_notrace' entry\n");
+
+	entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+				    NULL, &ftraced_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ftraced_enabled' entry\n");
+	return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
+static int __init ftrace_dynamic_init(void)
+{
+	struct task_struct *p;
+	unsigned long addr;
+	int ret;
+
+	addr = (unsigned long)ftrace_record_ip;
+
+	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr) {
+		ret = (int)addr;
+		goto failed;
+	}
+
+	ret = ftrace_dyn_table_alloc();
+	if (ret)
+		goto failed;
+
+	p = kthread_run(ftraced, NULL, "ftraced");
+	if (IS_ERR(p)) {
+		ret = -1;
+		goto failed;
+	}
+
+	last_ftrace_enabled = ftrace_enabled = 1;
+	ftraced_task = p;
+
+	return 0;
+
+ failed:
+	ftrace_disabled = 1;
+	return ret;
+}
+
+core_initcall(ftrace_dynamic_init);
+#else
+# define ftrace_startup()		do { } while (0)
+# define ftrace_shutdown()		do { } while (0)
+# define ftrace_startup_sysctl()	do { } while (0)
+# define ftrace_shutdown_sysctl()	do { } while (0)
+# define ftrace_force_shutdown()	do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure. If something was detected that seems
+ * wrong, calling this function will keep ftrace from doing
+ * any more modifications, and updates.
+ * used when something went wrong.
+ */
+void ftrace_kill(void)
+{
+	mutex_lock(&ftrace_sysctl_lock);
+	ftrace_disabled = 1;
+	ftrace_enabled = 0;
+
+	clear_ftrace_function();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	/* Try to totally disable ftrace */
+	ftrace_force_shutdown();
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -1;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __register_ftrace_function(ops);
+	ftrace_startup();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __unregister_ftrace_function(ops);
+	ftrace_shutdown();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *file, void __user *buffer, size_t *lenp,
+		     loff_t *ppos)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+	if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+		goto out;
+
+	last_ftrace_enabled = ftrace_enabled;
+
+	if (ftrace_enabled) {
+
+		ftrace_startup_sysctl();
+
+		/* we are starting ftrace again */
+		if (ftrace_list != &ftrace_list_end) {
+			if (ftrace_list->next == &ftrace_list_end)
+				ftrace_trace_function = ftrace_list->func;
+			else
+				ftrace_trace_function = ftrace_list_func;
+		}
+
+	} else {
+		/* stopping ftrace calls (just send to ftrace_stub) */
+		ftrace_trace_function = ftrace_stub;
+
+		ftrace_shutdown_sysctl();
+	}
+
+ out:
+	mutex_unlock(&ftrace_sysctl_lock);
+	return ret;
+}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..a864dd292e38
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3131 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/kprobes.h>
+#include <linux/writeback.h>
+
+#include <linux/stacktrace.h>
+
+#include "trace.h"
+
+unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_thresh;
+
+static unsigned long __read_mostly	tracing_nr_buffers;
+static cpumask_t __read_mostly		tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu)	\
+	for_each_cpu_mask(cpu, tracing_buffer_mask)
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly = {
+	.name		= "none",
+};
+
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+
+static int tracing_disabled = 1;
+
+static unsigned long tracing_pages_allocated;
+
+long
+ns2usecs(cycle_t nsec)
+{
+	nsec += 500;
+	do_div(nsec, 1000);
+	return nsec;
+}
+
+cycle_t ftrace_now(int cpu)
+{
+	return cpu_clock(cpu);
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a link list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the link list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array	global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the link list of the max_tr is swapped
+ * with the link list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array	max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int			tracer_enabled = 1;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long		trace_nr_entries = 65536UL;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer		*trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer		*current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int			max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. These is used with trace_poll for tasks polling the trace.
+ */
+void trace_wake_up(void)
+{
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+		wake_up(&trace_wait);
+}
+
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
+static int __init set_nr_entries(char *str)
+{
+	unsigned long nr_entries;
+	int ret;
+
+	if (!str)
+		return 0;
+	ret = strict_strtoul(str, 0, &nr_entries);
+	/* nr_entries can not be zero */
+	if (ret < 0 || nr_entries == 0)
+		return 0;
+	trace_nr_entries = nr_entries;
+	return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF	- interrupts were disabled
+ *  NEED_RESCED - reschedule is requested
+ *  HARDIRQ	- inside an interrupt handler
+ *  SOFTIRQ	- inside a softirq handler
+ */
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit postions in trace_iterator_flags */
+static const char *trace_options[] = {
+	"print-parent",
+	"sym-offset",
+	"sym-addr",
+	"verbose",
+	"raw",
+	"hex",
+	"bin",
+	"block",
+	"stacktrace",
+	"sched-tree",
+	NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static raw_spinlock_t ftrace_max_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this tasks comm */
+	tracing_record_cmdline(current);
+}
+
+#define CHECK_COND(cond)			\
+	if (unlikely(cond)) {			\
+		tracing_disabled = 1;		\
+		WARN_ON(1);			\
+		return -1;			\
+	}
+
+/**
+ * check_pages - integrity check of trace buffers
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+	struct page *page, *tmp;
+
+	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+
+	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+		CHECK_COND(page->lru.next->prev != &page->lru);
+		CHECK_COND(page->lru.prev->next != &page->lru);
+	}
+
+	return 0;
+}
+
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ *
+ * head_page returns the page address of the first page in
+ * a per_cpu buffer. This also preforms various consistency
+ * checks to make sure the buffer has not been corrupted.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+	struct page *page;
+
+	if (list_empty(&data->trace_pages))
+		return NULL;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	BUG_ON(&page->lru == &data->trace_pages);
+
+	return page_address(page);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data = mem;
+	unsigned char byte;
+	int i, j;
+
+	BUG_ON(len >= HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		byte = data[i];
+
+		hex[j++] = hex2asc[byte & 0x0f];
+		hex[j++] = hex2asc[byte >> 4];
+	}
+	hex[j++] = ' ';
+
+	return trace_seq_putmem(s, hex, j);
+}
+
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+	int len;
+	int ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	if (ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
+}
+
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_reset(s);
+}
+
+/*
+ * flip the trace buffers between two trace descriptors.
+ * This usually is the buffers between the global_trace and
+ * the max_tr to record a snapshot of a current trace.
+ *
+ * The ftrace_max_lock must be held.
+ */
+static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+	struct list_head flip_pages;
+
+	INIT_LIST_HEAD(&flip_pages);
+
+	memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
+		sizeof(struct trace_array_cpu) -
+		offsetof(struct trace_array_cpu, trace_head_idx));
+
+	check_pages(tr1);
+	check_pages(tr2);
+	list_splice_init(&tr1->trace_pages, &flip_pages);
+	list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+	list_splice_init(&flip_pages, &tr2->trace_pages);
+	BUG_ON(!list_empty(&flip_pages));
+	check_pages(tr1);
+	check_pages(tr2);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data;
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	/* clear out all the previous traces */
+	for_each_tracing_cpu(i) {
+		data = tr->data[i];
+		flip_trace(max_tr.data[i], data);
+		tracing_reset(data);
+	}
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr - tracer
+ * @tsk - task with the latency
+ * @cpu - the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(i)
+		tracing_reset(max_tr.data[i]);
+
+	flip_trace(max_tr.data[cpu], data);
+	tracing_reset(data);
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type - the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
+int register_tracer(struct tracer *type)
+{
+	struct tracer *t;
+	int len;
+	int ret = 0;
+
+	if (!type->name) {
+		pr_info("Tracer must have a name\n");
+		return -1;
+	}
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(type->name, t->name) == 0) {
+			/* already found */
+			pr_info("Trace %s already registered\n",
+				type->name);
+			ret = -1;
+			goto out;
+		}
+	}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	if (type->selftest) {
+		struct tracer *saved_tracer = current_trace;
+		struct trace_array_cpu *data;
+		struct trace_array *tr = &global_trace;
+		int saved_ctrl = tr->ctrl;
+		int i;
+		/*
+		 * Run a selftest on this tracer.
+		 * Here we reset the trace buffer, and set the current
+		 * tracer to be this tracer. The tracer can then run some
+		 * internal tracing to verify that everything is in order.
+		 * If we fail, we do not register this tracer.
+		 */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		current_trace = type;
+		tr->ctrl = 0;
+		/* the test is responsible for initializing and enabling */
+		pr_info("Testing tracer %s: ", type->name);
+		ret = type->selftest(type, tr);
+		/* the test is responsible for resetting too */
+		current_trace = saved_tracer;
+		tr->ctrl = saved_ctrl;
+		if (ret) {
+			printk(KERN_CONT "FAILED!\n");
+			goto out;
+		}
+		/* Only reset on passing, to avoid touching corrupted buffers */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		printk(KERN_CONT "PASSED\n");
+	}
+#endif
+
+	type->next = trace_types;
+	trace_types = type;
+	len = strlen(type->name);
+	if (len > max_tracer_type_len)
+		max_tracer_type_len = len;
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
+void unregister_tracer(struct tracer *type)
+{
+	struct tracer **t;
+	int len;
+
+	mutex_lock(&trace_types_lock);
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		if (*t == type)
+			goto found;
+	}
+	pr_info("Trace %s not registered\n", type->name);
+	goto out;
+
+ found:
+	*t = (*t)->next;
+	if (strlen(type->name) != max_tracer_type_len)
+		goto out;
+
+	max_tracer_type_len = 0;
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		len = strlen((*t)->name);
+		if (len > max_tracer_type_len)
+			max_tracer_type_len = len;
+	}
+ out:
+	mutex_unlock(&trace_types_lock);
+}
+
+void tracing_reset(struct trace_array_cpu *data)
+{
+	data->trace_idx = 0;
+	data->overrun = 0;
+	data->trace_head = data->trace_tail = head_page(data);
+	data->trace_head_idx = 0;
+	data->trace_tail_idx = 0;
+}
+
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+
+/* temporary disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+
+static void trace_init_cmdlines(void)
+{
+	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	cmdline_idx = 0;
+}
+
+void trace_stop_cmdline_recording(void);
+
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+	unsigned map;
+	unsigned idx;
+
+	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+		return;
+
+	/*
+	 * It's not the end of the world if we don't get
+	 * the lock, but we also don't want to spin
+	 * nor do we want to disable interrupts,
+	 * so if we miss here, then better luck next time.
+	 */
+	if (!spin_trylock(&trace_cmdline_lock))
+		return;
+
+	idx = map_pid_to_cmdline[tsk->pid];
+	if (idx >= SAVED_CMDLINES) {
+		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+		map = map_cmdline_to_pid[idx];
+		if (map <= PID_MAX_DEFAULT)
+			map_pid_to_cmdline[map] = (unsigned)-1;
+
+		map_pid_to_cmdline[tsk->pid] = idx;
+
+		cmdline_idx = idx;
+	}
+
+	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+	spin_unlock(&trace_cmdline_lock);
+}
+
+static char *trace_find_cmdline(int pid)
+{
+	char *cmdline = "<...>";
+	unsigned map;
+
+	if (!pid)
+		return "<idle>";
+
+	if (pid > PID_MAX_DEFAULT)
+		goto out;
+
+	map = map_pid_to_cmdline[pid];
+	if (map >= SAVED_CMDLINES)
+		goto out;
+
+	cmdline = saved_cmdlines[map];
+
+ out:
+	return cmdline;
+}
+
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+	if (atomic_read(&trace_record_cmdline_disabled))
+		return;
+
+	trace_save_cmdline(tsk);
+}
+
+static inline struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+	/*
+	 * Roundrobin - but skip the head (which is not a real page):
+	 */
+	next = next->next;
+	if (unlikely(next == &data->trace_pages))
+		next = next->next;
+	BUG_ON(next == &data->trace_pages);
+
+	return next;
+}
+
+static inline void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+	struct list_head *next;
+	struct page *page;
+
+	page = virt_to_page(addr);
+
+	next = trace_next_list(data, &page->lru);
+	page = list_entry(next, struct page, lru);
+
+	return page_address(page);
+}
+
+static inline struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	unsigned long idx, idx_next;
+	struct trace_entry *entry;
+
+	data->trace_idx++;
+	idx = data->trace_head_idx;
+	idx_next = idx + 1;
+
+	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+
+	entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
+
+	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+		data->trace_head = trace_next_page(data, data->trace_head);
+		idx_next = 0;
+	}
+
+	if (data->trace_head == data->trace_tail &&
+	    idx_next == data->trace_tail_idx) {
+		/* overrun */
+		data->overrun++;
+		data->trace_tail_idx++;
+		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+			data->trace_tail =
+				trace_next_page(data, data->trace_tail);
+			data->trace_tail_idx = 0;
+		}
+	}
+
+	data->trace_head_idx = idx_next;
+
+	return entry;
+}
+
+static inline void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+{
+	struct task_struct *tsk = current;
+	unsigned long pc;
+
+	pc = preempt_count();
+
+	entry->preempt_count	= pc & 0xff;
+	entry->pid		= (tsk) ? tsk->pid : 0;
+	entry->t		= ftrace_now(raw_smp_processor_id());
+	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+}
+
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+	       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_FN;
+	entry->fn.ip		= ip;
+	entry->fn.parent_ip	= parent_ip;
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	if (likely(!atomic_read(&data->disabled)))
+		trace_function(tr, data, ip, parent_ip, flags);
+}
+
+#ifdef CONFIG_MMIOTRACE
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_rw *rw)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_RW;
+	entry->mmiorw		= *rw;
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_map *map)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_MAP;
+	entry->mmiomap		= *map;
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+#endif
+
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip)
+{
+	struct trace_entry *entry;
+	struct stack_trace trace;
+
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_STACK;
+
+	memset(&entry->stack, 0, sizeof(entry->stack));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= skip;
+	trace.entries		= entry->stack.caller;
+
+	save_stack_trace(&trace);
+}
+
+void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_SPECIAL;
+	entry->special.arg1	= arg1;
+	entry->special.arg2	= arg2;
+	entry->special.arg3	= arg3;
+	__trace_stack(tr, data, irq_flags, 4);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *prev,
+			   struct task_struct *next,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_CTX;
+	entry->ctx.prev_pid	= prev->pid;
+	entry->ctx.prev_prio	= prev->prio;
+	entry->ctx.prev_state	= prev->state;
+	entry->ctx.next_pid	= next->pid;
+	entry->ctx.next_prio	= next->prio;
+	entry->ctx.next_state	= next->state;
+	__trace_stack(tr, data, flags, 5);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *wakee,
+			   struct task_struct *curr,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_WAKE;
+	entry->ctx.prev_pid	= curr->pid;
+	entry->ctx.prev_prio	= curr->prio;
+	entry->ctx.prev_state	= curr->state;
+	entry->ctx.next_pid	= wakee->pid;
+	entry->ctx.next_prio	= wakee->prio;
+	entry->ctx.next_state	= wakee->state;
+	__trace_stack(tr, data, flags, 6);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__trace_special(tr, data, arg1, arg2, arg3);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FTRACE
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+	register_ftrace_function(&trace_ops);
+}
+
+void tracing_stop_function_trace(void)
+{
+	unregister_ftrace_function(&trace_ops);
+}
+#endif
+
+enum trace_file_type {
+	TRACE_FILE_LAT_FMT	= 1,
+};
+
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+		struct trace_iterator *iter, int cpu)
+{
+	struct page *page;
+	struct trace_entry *array;
+
+	if (iter->next_idx[cpu] >= tr->entries ||
+	    iter->next_idx[cpu] >= data->trace_idx ||
+	    (data->trace_head == data->trace_tail &&
+	     data->trace_head_idx == data->trace_tail_idx))
+		return NULL;
+
+	if (!iter->next_page[cpu]) {
+		/* Initialize the iterator for this cpu trace buffer */
+		WARN_ON(!data->trace_tail);
+		page = virt_to_page(data->trace_tail);
+		iter->next_page[cpu] = &page->lru;
+		iter->next_page_idx[cpu] = data->trace_tail_idx;
+	}
+
+	page = list_entry(iter->next_page[cpu], struct page, lru);
+	BUG_ON(&data->trace_pages == &page->lru);
+
+	array = page_address(page);
+
+	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
+	return &array[iter->next_page_idx[cpu]];
+}
+
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_entry *ent, *next = NULL;
+	int next_cpu = -1;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+		/*
+		 * Pick the entry with the smallest timestamp:
+		 */
+		if (ent && (!next || ent->t < next->t)) {
+			next = ent;
+			next_cpu = cpu;
+		}
+	}
+
+	if (ent_cpu)
+		*ent_cpu = next_cpu;
+
+	return next;
+}
+
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+	iter->idx++;
+	iter->next_idx[iter->cpu]++;
+	iter->next_page_idx[iter->cpu]++;
+
+	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
+		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+		iter->next_page_idx[iter->cpu] = 0;
+		iter->next_page[iter->cpu] =
+			trace_next_list(data, iter->next_page[iter->cpu]);
+	}
+}
+
+static void trace_consume(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+	data->trace_tail_idx++;
+	if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+		data->trace_tail = trace_next_page(data, data->trace_tail);
+		data->trace_tail_idx = 0;
+	}
+
+	/* Check if we empty it, then reset the index */
+	if (data->trace_head == data->trace_tail &&
+	    data->trace_head_idx == data->trace_tail_idx)
+		data->trace_idx = 0;
+}
+
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
+
+	next = find_next_entry(iter, &next_cpu);
+
+	iter->prev_ent = iter->ent;
+	iter->prev_cpu = iter->cpu;
+
+	iter->ent = next;
+	iter->cpu = next_cpu;
+
+	if (next)
+		trace_iterator_increment(iter);
+
+	return next ? iter : NULL;
+}
+
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *last_ent = iter->ent;
+	int i = (int)*pos;
+	void *ent;
+
+	(*pos)++;
+
+	/* can't go backwards */
+	if (iter->idx > i)
+		return NULL;
+
+	if (iter->idx < 0)
+		ent = find_next_entry_inc(iter);
+	else
+		ent = iter;
+
+	while (ent && iter->idx < i)
+		ent = find_next_entry_inc(iter);
+
+	iter->pos = *pos;
+
+	if (last_ent && !ent)
+		seq_puts(m, "\n\nvim:ft=help\n");
+
+	return ent;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = 0;
+	int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (!current_trace || current_trace != iter->trace) {
+		mutex_unlock(&trace_types_lock);
+		return NULL;
+	}
+
+	atomic_inc(&trace_record_cmdline_disabled);
+
+	/* let the tracer grab locks here if needed */
+	if (current_trace->start)
+		current_trace->start(iter);
+
+	if (*pos != iter->pos) {
+		iter->ent = NULL;
+		iter->cpu = 0;
+		iter->idx = -1;
+		iter->prev_ent = NULL;
+		iter->prev_cpu = -1;
+
+		for_each_tracing_cpu(i) {
+			iter->next_idx[i] = 0;
+			iter->next_page[i] = NULL;
+		}
+
+		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+			;
+
+	} else {
+		l = *pos - 1;
+		p = s_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+	struct trace_iterator *iter = m->private;
+
+	atomic_dec(&trace_record_cmdline_disabled);
+
+	/* let the tracer release locks here if needed */
+	if (current_trace && current_trace == iter->trace && iter->trace->stop)
+		iter->trace->stop(iter);
+
+	mutex_unlock(&trace_types_lock);
+}
+
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+
+#ifdef CONFIG_KRETPROBES
+static inline int kretprobed(unsigned long addr)
+{
+	return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+static inline int kretprobed(unsigned long addr)
+{
+	return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(str, address);
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		ret = seq_print_sym_offset(s, "%s", ip);
+	else
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+static void print_lat_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#                _------=> CPU#            \n");
+	seq_puts(m, "#               / _-----=> irqs-off        \n");
+	seq_puts(m, "#              | / _----=> need-resched    \n");
+	seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+	seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "#              |||| /                      \n");
+	seq_puts(m, "#              |||||     delay             \n");
+	seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+
+static void print_func_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
+	seq_puts(m, "#              | |      |          |         |\n");
+}
+
+
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_array *tr = iter->tr;
+	struct trace_array_cpu *data = tr->data[tr->cpu];
+	struct tracer *type = current_trace;
+	unsigned long total   = 0;
+	unsigned long entries = 0;
+	int cpu;
+	const char *name = "preemption";
+
+	if (type)
+		name = type->name;
+
+	for_each_tracing_cpu(cpu) {
+		if (head_page(tr->data[cpu])) {
+			total += tr->data[cpu]->trace_idx;
+			if (tr->data[cpu]->trace_idx > tr->entries)
+				entries += tr->entries;
+			else
+				entries += tr->data[cpu]->trace_idx;
+		}
+	}
+
+	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+		   name, UTS_RELEASE);
+	seq_puts(m, "-----------------------------------"
+		 "---------------------------------\n");
+	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+		   nsecs_to_usecs(data->saved_latency),
+		   entries,
+		   total,
+		   tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+		   "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+		   "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+		   "preempt",
+#else
+		   "unknown",
+#endif
+		   /* These are reserved for later use */
+		   0, 0, 0, 0);
+#ifdef CONFIG_SMP
+	seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+	seq_puts(m, ")\n");
+#endif
+	seq_puts(m, "    -----------------\n");
+	seq_printf(m, "    | task: %.16s-%d "
+		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+		   data->comm, data->pid, data->uid, data->nice,
+		   data->policy, data->rt_priority);
+	seq_puts(m, "    -----------------\n");
+
+	if (data->critical_start) {
+		seq_puts(m, " => started at: ");
+		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n => ended at:   ");
+		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n");
+	}
+
+	seq_puts(m, "\n");
+}
+
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq) {
+		trace_seq_putc(s, 'H');
+	} else {
+		if (hardirq) {
+			trace_seq_putc(s, 'h');
+		} else {
+			if (softirq)
+				trace_seq_putc(s, 's');
+			else
+				trace_seq_putc(s, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		trace_seq_printf(s, "%x", entry->preempt_count);
+	else
+		trace_seq_puts(s, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+		    unsigned long rel_usecs)
+{
+	trace_seq_printf(s, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		trace_seq_puts(s, "!: ");
+	else if (rel_usecs > 1)
+		trace_seq_puts(s, "+: ");
+	else
+		trace_seq_puts(s, " : ");
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *next_entry = find_next_entry(iter, NULL);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	struct trace_entry *entry = iter->ent;
+	unsigned long abs_usecs;
+	unsigned long rel_usecs;
+	char *comm;
+	int S, T;
+	int i;
+	unsigned state;
+
+	if (!next_entry)
+		next_entry = entry;
+	rel_usecs = ns2usecs(next_entry->t - entry->t);
+	abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+
+	if (verbose) {
+		comm = trace_find_cmdline(entry->pid);
+		trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, cpu, entry->flags,
+				 entry->preempt_count, trace_idx,
+				 ns2usecs(entry->t),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
+	} else {
+		lat_print_generic(s, entry, cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	}
+	switch (entry->type) {
+	case TRACE_FN:
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		trace_seq_puts(s, " (");
+		if (kretprobed(entry->fn.parent_ip))
+			trace_seq_puts(s, KRETPROBE_MSG);
+		else
+			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+		trace_seq_puts(s, ")\n");
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+
+		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+		comm = trace_find_cmdline(entry->ctx.next_pid);
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S, entry->type == TRACE_CTX ? "==>" : "  +",
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio,
+				 T, comm);
+		break;
+	case TRACE_SPECIAL:
+		trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i)
+				trace_seq_puts(s, " <= ");
+			seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+		}
+		trace_seq_puts(s, "\n");
+		break;
+	default:
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	}
+	return 1;
+}
+
+static int print_trace_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *entry;
+	unsigned long usec_rem;
+	unsigned long long t;
+	unsigned long secs;
+	char *comm;
+	int ret;
+	int S, T;
+	int i;
+
+	entry = iter->ent;
+
+	comm = trace_find_cmdline(iter->ent->pid);
+
+	t = ns2usecs(entry->t);
+	usec_rem = do_div(t, 1000000ULL);
+	secs = (unsigned long)t;
+
+	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		if (!ret)
+			return 0;
+		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+						entry->fn.parent_ip) {
+			ret = trace_seq_printf(s, " <-");
+			if (!ret)
+				return 0;
+			if (kretprobed(entry->fn.parent_ip))
+				ret = trace_seq_puts(s, KRETPROBE_MSG);
+			else
+				ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+						       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->type == TRACE_CTX ? "==>" : "  +",
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i) {
+				ret = trace_seq_puts(s, " <= ");
+				if (!ret)
+					return 0;
+			}
+			ret = seq_print_ip_sym(s, entry->stack.caller[i],
+					       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_puts(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+	int ret;
+	int S, T;
+
+	entry = iter->ent;
+
+	ret = trace_seq_printf(s, "%d %d %llu ",
+		entry->pid, iter->cpu, entry->t);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = trace_seq_printf(s, "%x %x\n",
+					entry->fn.ip, entry->fn.parent_ip);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned char newline = '\n';
+	struct trace_entry *entry;
+	int S, T;
+
+	entry = iter->ent;
+
+	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, S);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		SEQ_PUT_HEX_FIELD_RET(s, T);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	SEQ_PUT_FIELD_RET(s, newline);
+
+	return 1;
+}
+
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+
+	entry = iter->ent;
+
+	SEQ_PUT_FIELD_RET(s, entry->pid);
+	SEQ_PUT_FIELD_RET(s, entry->cpu);
+	SEQ_PUT_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	return 1;
+}
+
+static int trace_empty(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (head_page(data) && data->trace_idx &&
+		    (data->trace_tail != data->trace_head ||
+		     data->trace_tail_idx != data->trace_head_idx))
+			return 0;
+	}
+	return 1;
+}
+
+static int print_trace_line(struct trace_iterator *iter)
+{
+	if (iter->trace && iter->trace->print_line)
+		return iter->trace->print_line(iter);
+
+	if (trace_flags & TRACE_ITER_BIN)
+		return print_bin_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_HEX)
+		return print_hex_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_RAW)
+		return print_raw_fmt(iter);
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+		return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+	return print_trace_fmt(iter);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+	struct trace_iterator *iter = v;
+
+	if (iter->ent == NULL) {
+		if (iter->tr) {
+			seq_printf(m, "# tracer: %s\n", iter->trace->name);
+			seq_puts(m, "#\n");
+		}
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return 0;
+			print_trace_header(m, iter);
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_lat_help_header(m);
+		} else {
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_func_help_header(m);
+		}
+	} else {
+		print_trace_line(iter);
+		trace_print_seq(m, &iter->seq);
+	}
+
+	return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+	.start		= s_start,
+	.next		= s_next,
+	.stop		= s_stop,
+	.show		= s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled) {
+		*ret = -ENODEV;
+		return NULL;
+	}
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		*ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
+	iter->trace = current_trace;
+	iter->pos = -1;
+
+	/* TODO stop tracer */
+	*ret = seq_open(file, &tracer_seq_ops);
+	if (!*ret) {
+		struct seq_file *m = file->private_data;
+		m->private = iter;
+
+		/* stop the trace while dumping */
+		if (iter->tr->ctrl)
+			tracer_enabled = 0;
+
+		if (iter->trace && iter->trace->open)
+			iter->trace->open(iter);
+	} else {
+		kfree(iter);
+		iter = NULL;
+	}
+	mutex_unlock(&trace_types_lock);
+
+ out:
+	return iter;
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+	if (tracing_disabled)
+		return -ENODEV;
+
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct trace_iterator *iter = m->private;
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace && iter->trace->close)
+		iter->trace->close(iter);
+
+	/* reenable tracing if it was previously enabled */
+	if (iter->tr->ctrl)
+		tracer_enabled = 1;
+	mutex_unlock(&trace_types_lock);
+
+	seq_release(inode, file);
+	kfree(iter);
+	return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	__tracing_open(inode, file, &ret);
+
+	return ret;
+}
+
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter;
+	int ret;
+
+	iter = __tracing_open(inode, file, &ret);
+
+	if (!ret)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+	return ret;
+}
+
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct tracer *t = m->private;
+
+	(*pos)++;
+
+	if (t)
+		t = t->next;
+
+	m->private = t;
+
+	return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct tracer *t = m->private;
+	loff_t l = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct tracer *t = v;
+
+	if (!t)
+		return 0;
+
+	seq_printf(m, "%s", t->name);
+	if (t->next)
+		seq_putc(m, ' ');
+	else
+		seq_putc(m, '\n');
+
+	return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	ret = seq_open(file, &show_traces_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = trace_types;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_fops = {
+	.open		= tracing_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations tracing_lt_fops = {
+	.open		= tracing_lt_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations show_traces_fops = {
+	.open		= show_traces_open,
+	.read		= seq_read,
+	.release	= seq_release,
+};
+
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When tracing/tracing_cpu_mask is modified then this holds
+ * the new bitmask we are about to install:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	int len;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	if (count - len < 2) {
+		count = -EINVAL;
+		goto out_err;
+	}
+	len += sprintf(mask_str + len, "\n");
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+		      size_t count, loff_t *ppos)
+{
+	int err, cpu;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	if (err)
+		goto err_unlock;
+
+	raw_local_irq_disable();
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(cpu) {
+		/*
+		 * Increase/decrease the disabled counter if we are
+		 * about to flip a bit in the cpumask:
+		 */
+		if (cpu_isset(cpu, tracing_cpumask) &&
+				!cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_inc(&global_trace.data[cpu]->disabled);
+		}
+		if (!cpu_isset(cpu, tracing_cpumask) &&
+				cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_enable();
+
+	tracing_cpumask = tracing_cpumask_new;
+
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+
+err_unlock:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return err;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_cpumask_read,
+	.write		= tracing_cpumask_write,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char *buf;
+	int r = 0;
+	int len = 0;
+	int i;
+
+	/* calulate max size */
+	for (i = 0; trace_options[i]; i++) {
+		len += strlen(trace_options[i]);
+		len += 3; /* "no" and space */
+	}
+
+	/* +2 for \n and \0 */
+	buf = kmalloc(len + 2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0; trace_options[i]; i++) {
+		if (trace_flags & (1 << i))
+			r += sprintf(buf + r, "%s ", trace_options[i]);
+		else
+			r += sprintf(buf + r, "no%s ", trace_options[i]);
+	}
+
+	r += sprintf(buf + r, "\n");
+	WARN_ON(r >= len + 2);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	kfree(buf);
+
+	return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp = buf;
+	int neg = 0;
+	int i;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strncmp(buf, "no", 2) == 0) {
+		neg = 1;
+		cmp += 2;
+	}
+
+	for (i = 0; trace_options[i]; i++) {
+		int len = strlen(trace_options[i]);
+
+		if (strncmp(cmp, trace_options[i], len) == 0) {
+			if (neg)
+				trace_flags &= ~(1 << i);
+			else
+				trace_flags |= (1 << i);
+			break;
+		}
+	}
+	/*
+	 * If no option could be set, return an error:
+	 */
+	if (!trace_options[i])
+		return -EINVAL;
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_iter_ctrl_read,
+	.write		= tracing_iter_ctrl_write,
+};
+
+static const char readme_msg[] =
+	"tracing mini-HOWTO:\n\n"
+	"# mkdir /debug\n"
+	"# mount -t debugfs nodev /debug\n\n"
+	"# cat /debug/tracing/available_tracers\n"
+	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"none\n"
+	"# echo sched_switch > /debug/tracing/current_tracer\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"sched_switch\n"
+	"# cat /debug/tracing/iter_ctrl\n"
+	"noprint-parent nosym-offset nosym-addr noverbose\n"
+	"# echo print-parent > /debug/tracing/iter_ctrl\n"
+	"# echo 1 > /debug/tracing/tracing_enabled\n"
+	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
+	"echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+					readme_msg, strlen(readme_msg));
+}
+
+static struct file_operations tracing_readme_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_readme_read,
+};
+
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", tr->ctrl);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&trace_types_lock);
+	if (tr->ctrl ^ val) {
+		if (val)
+			tracer_enabled = 1;
+		else
+			tracer_enabled = 0;
+
+		tr->ctrl = val;
+
+		if (current_trace && current_trace->ctrl_update)
+			current_trace->ctrl_update(tr);
+	}
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char buf[max_tracer_type_len+2];
+	int r;
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace)
+		r = sprintf(buf, "%s\n", current_trace->name);
+	else
+		r = sprintf(buf, "\n");
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *t;
+	char buf[max_tracer_type_len+1];
+	int i;
+
+	if (cnt > max_tracer_type_len)
+		cnt = max_tracer_type_len;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	/* strip ending whitespace. */
+	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+		buf[i] = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(t->name, buf) == 0)
+			break;
+	}
+	if (!t || t == current_trace)
+		goto out;
+
+	if (current_trace && current_trace->reset)
+		current_trace->reset(tr);
+
+	current_trace = t;
+	if (t->init)
+		t->init(tr);
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, sizeof(buf), "%ld\n",
+		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+	if (r > sizeof(buf))
+		r = sizeof(buf);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	*ptr = val * 1000;
+
+	return cnt;
+}
+
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	/* We only allow for reader of the pipe */
+	if (atomic_inc_return(&tracing_reader) != 1) {
+		atomic_dec(&tracing_reader);
+		return -EBUSY;
+	}
+
+	/* create a buffer to store the information to pass to userspace */
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&trace_types_lock);
+	iter->tr = &global_trace;
+	iter->trace = current_trace;
+	filp->private_data = iter;
+
+	if (iter->trace->pipe_open)
+		iter->trace->pipe_open(iter);
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter = file->private_data;
+
+	kfree(iter);
+	atomic_dec(&tracing_reader);
+
+	return 0;
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	if (trace_flags & TRACE_ITER_BLOCK) {
+		/*
+		 * Always select as readable when in blocking mode
+		 */
+		return POLLIN | POLLRDNORM;
+	} else {
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+		poll_wait(filp, &trace_wait, poll_table);
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+
+		return 0;
+	}
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	unsigned long flags;
+#ifdef CONFIG_FTRACE
+	int ftrace_save;
+#endif
+	int cpu;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+	sret = 0;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+	while (trace_empty(iter)) {
+
+		if ((filp->f_flags & O_NONBLOCK)) {
+			sret = -EAGAIN;
+			goto out;
+		}
+
+		/*
+		 * This is a make-shift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracing, traces all functions, we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 *     Anyway, this is really very primitive wakeup.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		iter->tr->waiter = current;
+
+		mutex_unlock(&trace_types_lock);
+
+		/* sleep for 100 msecs, and try again. */
+		schedule_timeout(HZ/10);
+
+		mutex_lock(&trace_types_lock);
+
+		iter->tr->waiter = NULL;
+
+		if (signal_pending(current)) {
+			sret = -EINTR;
+			goto out;
+		}
+
+		if (iter->trace != current_trace)
+			goto out;
+
+		/*
+		 * We block until we read something and tracing is disabled.
+		 * We still block if tracing is disabled, but we have never
+		 * read anything. This allows a user to cat this file, and
+		 * then enable tracing. But after we have read something,
+		 * we give an EOF when tracing is again disabled.
+		 *
+		 * iter->pos will be 0 if we haven't read anything.
+		 */
+		if (!tracer_enabled && iter->pos)
+			break;
+
+		continue;
+	}
+
+	/* stop when tracing is finished */
+	if (trace_empty(iter))
+		goto out;
+
+	if (cnt >= PAGE_SIZE)
+		cnt = PAGE_SIZE - 1;
+
+	/* reset all but tr, trace, and overruns */
+	memset(&iter->seq, 0,
+	       sizeof(struct trace_iterator) -
+	       offsetof(struct trace_iterator, seq));
+	iter->pos = -1;
+
+	/*
+	 * We need to stop all tracing on all CPUS to read the
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all what we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+	local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+	ftrace_save = ftrace_enabled;
+	ftrace_enabled = 0;
+#endif
+	smp_wmb();
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		cpu_set(cpu, mask);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter->last_overrun[cpu])
+			iter->overrun[cpu] +=
+				data->overrun - iter->last_overrun[cpu];
+		iter->last_overrun[cpu] = data->overrun;
+	}
+
+	while (find_next_entry_inc(iter) != NULL) {
+		int ret;
+		int len = iter->seq.len;
+
+		ret = print_trace_line(iter);
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
+			break;
+		}
+
+		trace_consume(iter);
+
+		if (iter->seq.len >= cnt)
+			break;
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		atomic_dec(&data->disabled);
+	}
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = ftrace_save;
+#endif
+	local_irq_restore(flags);
+
+	/* Now copy what we have to the user */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (iter->seq.readpos >= iter->seq.len)
+		trace_seq_reset(&iter->seq);
+	if (sret == -EBUSY)
+		sret = 0;
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return sret;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%lu\n", tr->entries);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int i, ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	/* must have at least 1 entry */
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
+
+	if (current_trace != &no_tracer) {
+		cnt = -EBUSY;
+		pr_info("ftrace: set current_tracer to none"
+			" before modifying buffer size\n");
+		goto out;
+	}
+
+	if (val > global_trace.entries) {
+		long pages_requested;
+		unsigned long freeable_pages;
+
+		/* make sure we have enough memory before mapping */
+		pages_requested =
+			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+		/* account for each buffer (and max_tr) */
+		pages_requested *= tracing_nr_buffers * 2;
+
+		/* Check for overflow */
+		if (pages_requested < 0) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		freeable_pages = determine_dirtyable_memory();
+
+		/* we only allow to request 1/4 of useable memory */
+		if (pages_requested >
+		    ((freeable_pages + tracing_pages_allocated) / 4)) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		while (global_trace.entries < val) {
+			if (trace_alloc_page()) {
+				cnt = -ENOMEM;
+				goto out;
+			}
+			/* double check that we don't go over the known pages */
+			if (tracing_pages_allocated > pages_requested)
+				break;
+		}
+
+	} else {
+		/* include the number of entries in val (inc of page entries) */
+		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+			trace_free_page();
+	}
+
+	/* check integrity */
+	for_each_tracing_cpu(i)
+		check_pages(global_trace.data[i]);
+
+	filp->f_pos += cnt;
+
+	/* If check pages failed, return ENOMEM */
+	if (tracing_disabled)
+		cnt = -ENOMEM;
+ out:
+	max_tr.entries = global_trace.entries;
+	mutex_unlock(&trace_types_lock);
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_ctrl_read,
+	.write		= tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_set_trace_read,
+	.write		= tracing_set_trace_write,
+};
+
+static struct file_operations tracing_pipe_fops = {
+	.open		= tracing_open_pipe,
+	.poll		= tracing_poll_pipe,
+	.read		= tracing_read_pipe,
+	.release	= tracing_release_pipe,
+};
+
+static struct file_operations tracing_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_entries_read,
+	.write		= tracing_entries_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	unsigned long *p = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", *p);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+static __init void tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+				    &global_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+				    NULL, &tracing_iter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+				    NULL, &tracing_cpumask_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+				    &global_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+	entry = debugfs_create_file("trace", 0444, d_tracer,
+				    &global_trace, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+	init_tracer_sysprof_debugfs(d_tracer);
+#endif
+}
+
+static int trace_alloc_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	void *array;
+	unsigned pages_allocated = 0;
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_tracing_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocate a page per CPU, add them */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	tracing_pages_allocated += pages_allocated;
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del_init(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
+}
+
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
+	int ret = -ENOMEM;
+	int i;
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
+	/* Allocate the first page for all buffers */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &data->trace_pages);
+		/* use the LRU flag to differentiate the two buffers */
+		ClearPageLRU(page);
+
+		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &max_tr.data[i]->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	global_trace.entries = ENTRIES_PER_PAGE;
+	pages++;
+
+	while (global_trace.entries < trace_nr_entries) {
+		if (trace_alloc_page())
+			break;
+		pages++;
+	}
+	max_tr.entries = global_trace.entries;
+
+	pr_info("tracer: %d pages allocated for %ld",
+		pages, trace_nr_entries);
+	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
+	pr_info("   actual entries %ld\n", global_trace.entries);
+
+	tracer_init_debugfs();
+
+	trace_init_cmdlines();
+
+	register_tracer(&no_tracer);
+	current_trace = &no_tracer;
+
+	/* All seems OK, enable tracing */
+	global_trace.ctrl = tracer_enabled;
+	tracing_disabled = 0;
+
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct page *page, *tmp;
+		struct trace_array_cpu *data = global_trace.data[i];
+
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+#endif
+	}
+	return ret;
+}
+fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..8cb215b239d5
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,333 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+	TRACE_WAKE,
+	TRACE_STACK,
+	TRACE_SPECIAL,
+	TRACE_MMIO_RW,
+	TRACE_MMIO_MAP,
+
+	__TRACE_LAST_TYPE
+};
+
+/*
+ * Function trace entry - function address and parent function addres:
+ */
+struct ftrace_entry {
+	unsigned long		ip;
+	unsigned long		parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+	unsigned int		prev_pid;
+	unsigned char		prev_prio;
+	unsigned char		prev_state;
+	unsigned int		next_pid;
+	unsigned char		next_prio;
+	unsigned char		next_state;
+};
+
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+	unsigned long		arg1;
+	unsigned long		arg2;
+	unsigned long		arg3;
+};
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES	8
+
+struct stack_entry {
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	char			type;
+	char			cpu;
+	char			flags;
+	char			preempt_count;
+	int			pid;
+	cycle_t			t;
+	union {
+		struct ftrace_entry		fn;
+		struct ctx_switch_entry		ctx;
+		struct special_entry		special;
+		struct stack_entry		stack;
+		struct mmiotrace_rw		mmiorw;
+		struct mmiotrace_map		mmiomap;
+	};
+};
+
+#define TRACE_ENTRY_SIZE	sizeof(struct trace_entry)
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+	struct list_head	trace_pages;
+	atomic_t		disabled;
+	raw_spinlock_t		lock;
+	struct lock_class_key	lock_key;
+
+	/* these fields get copied into max-trace: */
+	unsigned		trace_head_idx;
+	unsigned		trace_tail_idx;
+	void			*trace_head; /* producer */
+	void			*trace_tail; /* consumer */
+	unsigned long		trace_idx;
+	unsigned long		overrun;
+	unsigned long		saved_latency;
+	unsigned long		critical_start;
+	unsigned long		critical_end;
+	unsigned long		critical_sequence;
+	unsigned long		nice;
+	unsigned long		policy;
+	unsigned long		rt_priority;
+	cycle_t			preempt_timestamp;
+	pid_t			pid;
+	uid_t			uid;
+	char			comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+	unsigned long		entries;
+	long			ctrl;
+	int			cpu;
+	cycle_t			time_start;
+	struct task_struct	*waiter;
+	struct trace_array_cpu	*data[NR_CPUS];
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ */
+struct tracer {
+	const char		*name;
+	void			(*init)(struct trace_array *tr);
+	void			(*reset)(struct trace_array *tr);
+	void			(*open)(struct trace_iterator *iter);
+	void			(*pipe_open)(struct trace_iterator *iter);
+	void			(*close)(struct trace_iterator *iter);
+	void			(*start)(struct trace_iterator *iter);
+	void			(*stop)(struct trace_iterator *iter);
+	ssize_t			(*read)(struct trace_iterator *iter,
+					struct file *filp, char __user *ubuf,
+					size_t cnt, loff_t *ppos);
+	void			(*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	int			(*selftest)(struct tracer *trace,
+					    struct trace_array *tr);
+#endif
+	int			(*print_line)(struct trace_iterator *iter);
+	struct tracer		*next;
+	int			print_max;
+};
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	long			last_overrun[NR_CPUS];
+	long			overrun[NR_CPUS];
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+
+	struct trace_entry	*prev_ent;
+	int			prev_cpu;
+
+	unsigned long		iter_flags;
+	loff_t			pos;
+	unsigned long		next_idx[NR_CPUS];
+	struct list_head	*next_page[NR_CPUS];
+	unsigned		next_page_idx[NR_CPUS];
+	long			idx;
+};
+
+void tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+
+void ftrace(struct trace_array *tr,
+			    struct trace_array_cpu *data,
+			    unsigned long ip,
+			    unsigned long parent_ip,
+			    unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *wakee,
+				struct task_struct *cur,
+				unsigned long flags);
+void trace_special(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long arg1,
+		   unsigned long arg2,
+		   unsigned long arg3);
+void trace_function(struct trace_array *tr,
+		    struct trace_array_cpu *data,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags);
+
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+			  struct task_struct *tsk, int cpu);
+
+extern cycle_t ftrace_now(int cpu);
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+typedef void
+(*tracer_switch_func_t)(void *private,
+			void *__rq,
+			struct task_struct *prev,
+			struct task_struct *next);
+
+struct tracer_switch_ops {
+	tracer_switch_func_t		func;
+	void				*private;
+	struct tracer_switch_ops	*next;
+};
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+
+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_map *map);
+#endif
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+					   struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+					  struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+					     struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+						 struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+					 struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+
+extern unsigned long trace_flags;
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ */
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
+	TRACE_ITER_SCHED_TREE		= 0x200,
+};
+
+#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..7ee7dcd76b7d
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,78 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static void function_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void start_function_trace(struct trace_array *tr)
+{
+	function_reset(tr);
+	tracing_start_cmdline_record();
+	tracing_start_function_trace();
+}
+
+static void stop_function_trace(struct trace_array *tr)
+{
+	tracing_stop_function_trace();
+	tracing_stop_cmdline_record();
+}
+
+static void function_trace_init(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+}
+
+static void function_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_function_trace(tr);
+}
+
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+	else
+		stop_function_trace(tr);
+}
+
+static struct tracer function_trace __read_mostly =
+{
+	.name	     = "ftrace",
+	.init	     = function_trace_init,
+	.reset	     = function_trace_reset,
+	.ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_function,
+#endif
+};
+
+static __init int init_function_trace(void)
+{
+	return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..421d6fe3650e
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,486 @@
+/*
+ * trace irqs off criticall timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+static DEFINE_SPINLOCK(max_trace_lock);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	/*
+	 * Does not matter if we preempt. We test the flags
+	 * afterward, to see if irqs are disabled or not.
+	 * If we preempt and get a false positive, the flags
+	 * test will fail.
+	 */
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return;
+
+	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	spin_lock_irqsave(&max_trace_lock, flags);
+
+	/* check if we are still the max latency */
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+
+	latency = nsecs_to_usecs(delta);
+
+	if (data->critical_sequence != max_sequence)
+		goto out_unlock;
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	data->critical_end = parent_ip;
+
+	update_max_tr_single(tr, current, cpu);
+
+	max_sequence++;
+
+out_unlock:
+	spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	tracing_reset(data);
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+
+	if (per_cpu(tracing_cpu, cpu))
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	data->critical_start = parent_ip ? : ip;
+	tracing_reset(data);
+
+	local_save_flags(flags);
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+	per_cpu(tracing_cpu, cpu) = 1;
+
+	atomic_dec(&data->disabled);
+}
+
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	cpu = raw_smp_processor_id();
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(per_cpu(tracing_cpu, cpu)))
+		per_cpu(tracing_cpu, cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!head_page(data)) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	local_save_flags(flags);
+	trace_function(tr, data, ip, parent_ip, flags);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings used to for stoppage (in idle) */
+void start_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+void stop_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void trace_hardirqs_on(void)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+	register_ftrace_function(&trace_ops);
+	tracer_enabled = 1;
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+}
+
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visible */
+	smp_wmb();
+
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+	else
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_irqsoff_tracer(iter->tr);
+}
+
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		start_irqsoff_tracer(iter->tr);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_irqsoff,
+#endif
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptoff,
+#endif
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+__init static int init_irqsoff_tracer(void)
+{
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
+
+	return 0;
+}
+device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+
+#include "trace.h"
+
+struct header_iter {
+	struct pci_dev *dev;
+};
+
+static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
+
+static void mmio_reset_data(struct trace_array *tr)
+{
+	int cpu;
+
+	overrun_detected = false;
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	mmio_trace_array = tr;
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
+		enable_mmiotrace();
+	}
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl)
+		disable_mmiotrace();
+	mmio_reset_data(tr);
+	mmio_trace_array = NULL;
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
+		enable_mmiotrace();
+	} else {
+		disable_mmiotrace();
+	}
+}
+
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+	int ret = 0;
+	int i;
+	resource_size_t start, end;
+	const struct pci_driver *drv = pci_dev_driver(dev);
+
+	/* XXX: incomplete checks for trace_seq_printf() return value */
+	ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+				dev->bus->number, dev->devfn,
+				dev->vendor, dev->device, dev->irq);
+	/*
+	 * XXX: is pci_resource_to_user() appropriate, since we are
+	 * supposed to interpret the __ioremap() phys_addr argument based on
+	 * these printed values?
+	 */
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			(unsigned long long)(start |
+			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+	}
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			dev->resource[i].start < dev->resource[i].end ?
+			(unsigned long long)(end - start) + 1 : 0);
+	}
+	if (drv)
+		ret += trace_seq_printf(s, " %s\n", drv->name);
+	else
+		ret += trace_seq_printf(s, " \n");
+	return ret;
+}
+
+static void destroy_header_iter(struct header_iter *hiter)
+{
+	if (!hiter)
+		return;
+	pci_dev_put(hiter->dev);
+	kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+	struct header_iter *hiter;
+	struct trace_seq *s = &iter->seq;
+
+	trace_seq_printf(s, "VERSION 20070824\n");
+
+	hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+	if (!hiter)
+		return;
+
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+	iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+	struct header_iter *hiter = iter->private;
+	destroy_header_iter(hiter);
+	iter->private = NULL;
+}
+
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+	int cpu;
+	unsigned long cnt = 0;
+	for_each_online_cpu(cpu) {
+		cnt += iter->overrun[cpu];
+		iter->overrun[cpu] = 0;
+	}
+	return cnt;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+				char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	ssize_t ret;
+	struct header_iter *hiter = iter->private;
+	struct trace_seq *s = &iter->seq;
+	unsigned long n;
+
+	n = count_overruns(iter);
+	if (n) {
+		/* XXX: This is later than where events were lost. */
+		trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+		if (!overrun_detected)
+			pr_warning("mmiotrace has lost events.\n");
+		overrun_detected = true;
+		goto print_out;
+	}
+
+	if (!hiter)
+		return 0;
+
+	mmio_print_pcidev(s, hiter->dev);
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+	if (!hiter->dev) {
+		destroy_header_iter(hiter);
+		iter->private = NULL;
+	}
+
+print_out:
+	ret = trace_seq_to_user(s, ubuf, cnt);
+	return (ret == -EBUSY) ? 0 : ret;
+}
+
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_rw *rw	= &entry->mmiorw;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_READ:
+		ret = trace_seq_printf(s,
+			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			rw->value, rw->pc, 0);
+		break;
+	case MMIO_WRITE:
+		ret = trace_seq_printf(s,
+			"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			rw->value, rw->pc, 0);
+		break;
+	case MMIO_UNKNOWN_OP:
+		ret = trace_seq_printf(s,
+			"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+			secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+			(rw->value >> 0) & 0xff, rw->pc, 0);
+		break;
+	default:
+		ret = trace_seq_printf(s, "rw what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+static int mmio_print_map(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_map *m	= &entry->mmiomap;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_PROBE:
+		ret = trace_seq_printf(s,
+			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+			secs, usec_rem, m->map_id,
+			(unsigned long long)m->phys, m->virt, m->len,
+			0UL, 0);
+		break;
+	case MMIO_UNPROBE:
+		ret = trace_seq_printf(s,
+			"UNMAP %lu.%06lu %d 0x%lx %d\n",
+			secs, usec_rem, m->map_id, 0UL, 0);
+		break;
+	default:
+		ret = trace_seq_printf(s, "map what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+/* return 0 to abort printing without consuming current entry in pipe mode */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+	switch (iter->ent->type) {
+	case TRACE_MMIO_RW:
+		return mmio_print_rw(iter);
+	case TRACE_MMIO_MAP:
+		return mmio_print_map(iter);
+	default:
+		return 1; /* ignore unknown entries */
+	}
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+	.name		= "mmiotrace",
+	.init		= mmio_trace_init,
+	.reset		= mmio_trace_reset,
+	.pipe_open	= mmio_pipe_open,
+	.close		= mmio_close,
+	.read		= mmio_read,
+	.ctrl_update	= mmio_trace_ctrl_update,
+	.print_line	= mmio_print_line,
+};
+
+__init static int init_mmio_trace(void)
+{
+	return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_rw(tr, data, rw);
+}
+
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data;
+
+	preempt_disable();
+	data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_map(tr, data, map);
+	preempt_enable();
+}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..c16935d3bc5c
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,286 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
+
+static void
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+			struct task_struct *next)
+{
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	tracing_record_cmdline(prev);
+	tracing_record_cmdline(next);
+
+	if (!tracer_enabled)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	sched_switch_func(probe_data, __rq, prev, next);
+}
+
+static void
+wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
+			task_struct *curr)
+{
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	tracing_record_cmdline(curr);
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
+
+	wakeup_func(probe_data, __rq, task, curr);
+}
+
+static void sched_switch_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static int tracing_sched_register(void)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return ret;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&ctx_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	return ret;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+	return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+}
+
+void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (ref == 1)
+		tracing_sched_register();
+}
+
+void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
+	if (ref)
+		tracing_sched_unregister();
+}
+
+void tracing_start_cmdline_record(void)
+{
+	tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+	tracing_stop_sched_switch();
+}
+
+static void start_sched_trace(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracer_enabled = 1;
+	tracing_start_cmdline_record();
+}
+
+static void stop_sched_trace(struct trace_array *tr)
+{
+	tracing_stop_cmdline_record();
+	tracer_enabled = 0;
+}
+
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_sched_trace(tr);
+}
+
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_sched_trace(tr);
+	else
+		stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+	.name		= "sched_switch",
+	.init		= sched_switch_trace_init,
+	.reset		= sched_switch_trace_reset,
+	.ctrl_update	= sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_sched_switch,
+#endif
+};
+
+__init static int init_sched_switch_trace(void)
+{
+	int ret = 0;
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_register();
+	if (ret) {
+		pr_info("error registering scheduler trace\n");
+		return ret;
+	}
+	return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..bf7e91caef57
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,447 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/marker.h>
+
+#include "trace.h"
+
+static struct trace_array	*wakeup_trace;
+static int __read_mostly	tracer_enabled;
+
+static struct task_struct	*wakeup_task;
+static int			wakeup_cpu;
+static unsigned			wakeup_prio = -1;
+
+static DEFINE_SPINLOCK(wakeup_lock);
+
+static void __wakeup_reset(struct trace_array *tr);
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+
+	if (likely(!wakeup_task))
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	if (unlikely(!wakeup_task))
+		goto unlock;
+
+	/*
+	 * The task can't disappear because it needs to
+	 * wake up first, and we have the wakeup_lock.
+	 */
+	if (task_cpu(wakeup_task) != cpu)
+		goto unlock;
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+	atomic_dec(&data->disabled);
+
+	/*
+	 * To prevent recursion from the scheduler, if the
+	 * resched flag was set before we entered, then
+	 * don't reschedule.
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+	struct task_struct *next)
+{
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	/*
+	 * When we start a new trace, we set wakeup_task to NULL
+	 * and then set tracer_enabled = 1. We want to make sure
+	 * that another CPU does not see the tracer_enabled = 1
+	 * and the wakeup_task with an older task, that might
+	 * actually be the same as next.
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waiting for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!tracer_enabled || next != wakeup_task))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	latency = nsecs_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+out_unlock:
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
+static void __wakeup_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	wakeup_task = NULL;
+}
+
+static void wakeup_reset(struct trace_array *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+		   struct task_struct *curr)
+{
+	int cpu = smp_processor_id();
+	unsigned long flags;
+	long disabled;
+
+	if (likely(!rt_task(p)) ||
+			p->prio >= wakeup_prio ||
+			p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races. */
+	if (!tracer_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+	trace_function(tr, tr->data[wakeup_cpu],
+		       CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+	spin_unlock(&wakeup_lock);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct trace_array **ptr = probe_data;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
+
+	wakeup_check_start(tr, task, curr);
+}
+
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&wakeup_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the tracer_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
+
+	return;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_tracer(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+	else
+		stop_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_tracer(iter->tr);
+}
+
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.open		= wakeup_tracer_open,
+	.close		= wakeup_tracer_close,
+	.ctrl_update	= wakeup_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
+__init static int init_wakeup_tracer(void)
+{
+	int ret;
+
+	ret = register_tracer(&wakeup_tracer);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..0911b7e073bf
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,563 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+	switch (entry->type) {
+	case TRACE_FN:
+	case TRACE_CTX:
+	case TRACE_WAKE:
+	case TRACE_STACK:
+	case TRACE_SPECIAL:
+		return 1;
+	}
+	return 0;
+}
+
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	struct trace_entry *entries;
+	struct page *page;
+	int idx = 0;
+	int i;
+
+	BUG_ON(list_empty(&data->trace_pages));
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	entries = page_address(page);
+
+	check_pages(data);
+	if (head_page(data) != entries)
+		goto failed;
+
+	/*
+	 * The starting trace buffer always has valid elements,
+	 * if any element exists.
+	 */
+	entries = head_page(data);
+
+	for (i = 0; i < tr->entries; i++) {
+
+		if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+			printk(KERN_CONT ".. invalid entry %d ",
+				entries[idx].type);
+			goto failed;
+		}
+
+		idx++;
+		if (idx >= ENTRIES_PER_PAGE) {
+			page = virt_to_page(entries);
+			if (page->lru.next == &data->trace_pages) {
+				if (i != tr->entries - 1) {
+					printk(KERN_CONT ".. entries buffer mismatch");
+					goto failed;
+				}
+			} else {
+				page = list_entry(page->lru.next, struct page, lru);
+				entries = page_address(page);
+			}
+			idx = 0;
+		}
+	}
+
+	page = virt_to_page(entries);
+	if (page->lru.next != &data->trace_pages) {
+		printk(KERN_CONT ".. too many entries");
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	/* disable tracing */
+	tracing_disabled = 1;
+	printk(KERN_CONT ".. corrupted trace buffer .. ");
+	return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+	unsigned long flags, cnt = 0;
+	int cpu, ret = 0;
+
+	/* Don't allow flipping of max traces now */
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_possible_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+
+		cnt += tr->data[cpu]->trace_idx;
+
+		ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+		if (ret)
+			break;
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_restore(flags);
+
+	if (count)
+		*count = cnt;
+
+	return ret;
+}
+
+#ifdef CONFIG_FTRACE
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+/* Test dynamic code modification and ftrace filters */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+					   struct trace_array *tr,
+					   int (*func)(void))
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+	char *func_name;
+
+	/* The ftrace test PASSED */
+	printk(KERN_CONT "PASSED\n");
+	pr_info("Testing dynamic ftrace: ");
+
+	/* enable tracing, and record the filter function */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	/* passed in by parameter to fool gcc from optimizing */
+	func();
+
+	/* update the records */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/*
+	 * Some archs *cough*PowerPC*cough* add charachters to the
+	 * start of the function names. We simply put a '*' to
+	 * accomodate them.
+	 */
+	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
+	/* filter only on our function */
+	ftrace_set_filter(func_name, strlen(func_name), 1);
+
+	/* enable tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+
+	/* we should have nothing in the buffer */
+	ret = trace_test_buffer(tr, &count);
+	if (ret)
+		goto out;
+
+	if (count) {
+		ret = -1;
+		printk(KERN_CONT ".. filter did not filter .. ");
+		goto out;
+	}
+
+	/* call our function again */
+	func();
+
+	/* sleep again */
+	msleep(100);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	/* we should only have one item */
+	if (!ret && count != 1) {
+		printk(KERN_CONT ".. filter failed count=%ld ..", count);
+		ret = -1;
+		goto out;
+	}
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* Enable tracing on all functions again */
+	ftrace_set_filter(NULL, 0, 1);
+
+	return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+
+	/* make sure msleep has been recorded */
+	msleep(1);
+
+	/* force the recorded functions to be traced */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/* start the tracing */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+						     DYN_FTRACE_TEST_NAME);
+
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* kill ftrace totally if we failed */
+	if (ret)
+		ftrace_kill();
+
+	return ret;
+}
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable interrupts for a bit */
+	local_irq_disable();
+	udelay(100);
+	local_irq_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable preemption for a bit */
+	preempt_disable();
+	udelay(100);
+	preempt_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* disable preemption and interrupts for a bit */
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+	if (ret)
+		goto out;
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* do the test by disabling interrupts first this time */
+	tracing_max_latency = 0;
+	tr->ctrl = 1;
+	trace->ctrl_update(tr);
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+ out:
+	trace->reset(tr);
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_SCHED_TRACER
+static int trace_wakeup_test_thread(void *data)
+{
+	/* Make this a RT thread, doesn't need to be too high */
+	struct sched_param param = { .sched_priority = 5 };
+	struct completion *x = data;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+
+	/* Make it know we have a new prio */
+	complete(x);
+
+	/* now go to sleep and let the test wake us up */
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	/* we are awake, now wait to disappear */
+	while (!kthread_should_stop()) {
+		/*
+		 * This is an RT task, do short sleeps to let
+		 * others run.
+		 */
+		msleep(100);
+	}
+
+	return 0;
+}
+
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	struct task_struct *p;
+	struct completion isrt;
+	unsigned long count;
+	int ret;
+
+	init_completion(&isrt);
+
+	/* create a high prio thread */
+	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+	if (IS_ERR(p)) {
+		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+		return -1;
+	}
+
+	/* make sure the thread is running at an RT prio */
+	wait_for_completion(&isrt);
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* sleep to let the RT thread sleep too */
+	msleep(100);
+
+	/*
+	 * Yes this is slightly racy. It is possible that for some
+	 * strange reason that the RT thread we created, did not
+	 * call schedule for 100ms after doing the completion,
+	 * and we do a wakeup on a task that already is awake.
+	 * But that is extremely unlikely, and the worst thing that
+	 * happens in such a case, is that we disable tracing.
+	 * Honestly, if this race does happen something is horrible
+	 * wrong with the system.
+	 */
+
+	wake_up_process(p);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+
+
+	trace->reset(tr);
+
+	tracing_max_latency = save_max;
+
+	/* kill the thread */
+	kthread_stop(p);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_SYSPROF_TRACER
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	return ret;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+	/* used to call mcount */
+	return 0;
+}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 000000000000..2301e1e7c606
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,363 @@
+/*
+ * trace stack traces
+ *
+ * Copyright (C) 2004-2008, Soeren Sandmann
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/fs.h>
+
+#include <asm/stacktrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*sysprof_trace;
+static int __read_mostly	tracer_enabled;
+
+/*
+ * 1 msec sample interval by default:
+ */
+static unsigned long sample_period = 1000000;
+static const unsigned int sample_max_depth = 512;
+
+static DEFINE_MUTEX(sample_timer_lock);
+/*
+ * Per CPU hrtimers that do the profiling:
+ */
+static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+struct backtrace_info {
+	struct trace_array_cpu	*data;
+	struct trace_array	*tr;
+	int			pos;
+};
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct backtrace_info *info = data;
+
+	if (info->pos < sample_max_depth && reliable) {
+		__trace_special(info->tr, info->data, 1, addr, 0);
+
+		info->pos++;
+	}
+}
+
+const static struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static int
+trace_kernel(struct pt_regs *regs, struct trace_array *tr,
+	     struct trace_array_cpu *data)
+{
+	struct backtrace_info info;
+	unsigned long bp;
+	char *stack;
+
+	info.tr = tr;
+	info.data = data;
+	info.pos = 1;
+
+	__trace_special(info.tr, info.data, 1, regs->ip, 0);
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = regs->bp;
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
+
+	return info.pos;
+}
+
+static void timer_notify(struct pt_regs *regs, int cpu)
+{
+	struct trace_array_cpu *data;
+	struct stack_frame frame;
+	struct trace_array *tr;
+	const void __user *fp;
+	int is_user;
+	int i;
+
+	if (!regs)
+		return;
+
+	tr = sysprof_trace;
+	data = tr->data[cpu];
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	__trace_special(tr, data, 0, 0, current->pid);
+
+	if (!is_user)
+		i = trace_kernel(regs, tr, data);
+	else
+		i = 0;
+
+	/*
+	 * Trace user stack if we are not a kernel thread
+	 */
+	if (current->mm && i < sample_max_depth) {
+		regs = (struct pt_regs *)current->thread.sp0 - 1;
+
+		fp = (void __user *)regs->bp;
+
+		__trace_special(tr, data, 2, regs->ip, 0);
+
+		while (i < sample_max_depth) {
+			frame.next_fp = 0;
+			frame.return_address = 0;
+			if (!copy_stack_frame(fp, &frame))
+				break;
+			if ((unsigned long)fp < regs->sp)
+				break;
+
+			__trace_special(tr, data, 2, frame.return_address,
+					(unsigned long)fp);
+			fp = frame.next_fp;
+
+			i++;
+		}
+
+	}
+
+	/*
+	 * Special trace entry if we overflow the max depth:
+	 */
+	if (i == sample_max_depth)
+		__trace_special(tr, data, -1, -1, -1);
+
+	__trace_special(tr, data, 3, current->pid, i);
+}
+
+static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
+{
+	/* trace here */
+	timer_notify(get_irq_regs(), smp_processor_id());
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+	return HRTIMER_RESTART;
+}
+
+static void start_stack_timer(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = stack_trace_timer_fn;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+}
+
+static void start_stack_timers(void)
+{
+	cpumask_t saved_mask = current->cpus_allowed;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		start_stack_timer(cpu);
+	}
+	set_cpus_allowed_ptr(current, &saved_mask);
+}
+
+static void stop_stack_timer(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+	hrtimer_cancel(hrtimer);
+}
+
+static void stop_stack_timers(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		stop_stack_timer(cpu);
+}
+
+static void stack_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void start_stack_trace(struct trace_array *tr)
+{
+	mutex_lock(&sample_timer_lock);
+	stack_reset(tr);
+	start_stack_timers();
+	tracer_enabled = 1;
+	mutex_unlock(&sample_timer_lock);
+}
+
+static void stop_stack_trace(struct trace_array *tr)
+{
+	mutex_lock(&sample_timer_lock);
+	stop_stack_timers();
+	tracer_enabled = 0;
+	mutex_unlock(&sample_timer_lock);
+}
+
+static void stack_trace_init(struct trace_array *tr)
+{
+	sysprof_trace = tr;
+
+	if (tr->ctrl)
+		start_stack_trace(tr);
+}
+
+static void stack_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_stack_trace(tr);
+}
+
+static void stack_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_stack_trace(tr);
+	else
+		stop_stack_trace(tr);
+}
+
+static struct tracer stack_trace __read_mostly =
+{
+	.name		= "sysprof",
+	.init		= stack_trace_init,
+	.reset		= stack_trace_reset,
+	.ctrl_update	= stack_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_sysprof,
+#endif
+};
+
+__init static int init_stack_trace(void)
+{
+	return register_tracer(&stack_trace);
+}
+device_initcall(init_stack_trace);
+
+#define MAX_LONG_DIGITS 22
+
+static ssize_t
+sysprof_sample_read(struct file *filp, char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_LONG_DIGITS];
+	int r;
+
+	r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+sysprof_sample_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_LONG_DIGITS];
+	unsigned long val;
+
+	if (cnt > MAX_LONG_DIGITS-1)
+		cnt = MAX_LONG_DIGITS-1;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+	/*
+	 * Enforce a minimum sample period of 100 usecs:
+	 */
+	if (val < 100)
+		val = 100;
+
+	mutex_lock(&sample_timer_lock);
+	stop_stack_timers();
+	sample_period = val * 1000;
+	start_stack_timers();
+	mutex_unlock(&sample_timer_lock);
+
+	return cnt;
+}
+
+static struct file_operations sysprof_sample_fops = {
+	.read		= sysprof_sample_read,
+	.write		= sysprof_sample_write,
+};
+
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
+{
+	struct dentry *entry;
+
+	entry = debugfs_create_file("sysprof_sample_period", 0644,
+			d_tracer, NULL, &sysprof_sample_fops);
+	if (entry)
+		return;
+	pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
+}