summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2009-08-18 15:51:43 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2009-08-18 15:51:43 +1000
commitfa461903f7fa480386faaaea233b29d81caca6a5 (patch)
treeae782306fa19199c50ddc161e5d69617a757cfcd /kernel
parent8a2fe3a077671738a784311eeed40c4650b287d1 (diff)
parenteb9c6b238476ba13aa40d0d70db839fbe7d3e97d (diff)
Merge commit 'tip/auto-latest'
Conflicts: arch/x86/include/asm/socket.h include/linux/rcupdate.h kernel/fork.c kernel/trace/trace.h
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/futex.c47
-rw-r--r--kernel/hrtimer.c57
-rw-r--r--kernel/irq/chip.c83
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c107
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/itimer.c164
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c30
-rw-r--r--kernel/lockdep.c788
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c120
-rw-r--r--kernel/module.c11
-rw-r--r--kernel/perf_counter.c11
-rw-r--r--kernel/posix-cpu-timers.c150
-rw-r--r--kernel/printk.c175
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c41
-rw-r--r--kernel/rcupreempt.c35
-rw-r--r--kernel/rcupreempt_trace.c5
-rw-r--r--kernel/rcutorture.c202
-rw-r--r--kernel/rcutree.c77
-rw-r--r--kernel/sched.c352
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_fair.c30
-rw-r--r--kernel/sched_rt.c56
-rw-r--r--kernel/time/timekeeping.c128
-rw-r--r--kernel/timer.c24
-rw-r--r--kernel/trace/ftrace.c107
-rw-r--r--kernel/trace/kmemtrace.c145
-rw-r--r--kernel/trace/ring_buffer.c940
-rw-r--r--kernel/trace/trace.c222
-rw-r--r--kernel/trace/trace.h43
-rw-r--r--kernel/trace/trace_events.c104
-rw-r--r--kernel/trace/trace_events_filter.c169
-rw-r--r--kernel/trace/trace_export.c6
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_functions_graph.c164
-rw-r--r--kernel/trace/trace_sched_switch.c57
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c43
-rw-r--r--kernel/trace/trace_stat.c17
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c389
-rw-r--r--kernel/trace/trace_workqueue.c32
-rw-r--r--kernel/tracepoint.c41
-rw-r--r--kernel/workqueue.c7
50 files changed, 3508 insertions, 2548 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 55a6d6aa2395..a00eb1c3a8af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,7 +81,6 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 2d7d99b7ef45..1008c1c61abd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -63,6 +63,7 @@
#include <linux/magic.h>
#include <linux/perf_counter.h>
#include <linux/kmemleak.h>
+#include <linux/posix-timers.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -795,10 +796,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
thread_group_cputime_init(sig);
/* Expiration times and increments. */
- sig->it_virt_expires = cputime_zero;
- sig->it_virt_incr = cputime_zero;
- sig->it_prof_expires = cputime_zero;
- sig->it_prof_incr = cputime_zero;
+ sig->it[CPUCLOCK_PROF].expires = cputime_zero;
+ sig->it[CPUCLOCK_PROF].incr = cputime_zero;
+ sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
+ sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
/* Cached expiration times. */
sig->cputime_expires.prof_exp = cputime_zero;
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
/* rt_waiter storage for requeue_pi: */
struct rt_mutex_waiter *rt_waiter;
+ /* The expected requeue pi target futex key: */
+ union futex_key *requeue_pi_key;
+
/* Bitset for the optional bitmasked wakeup */
u32 bitset;
};
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (!top_waiter)
return 0;
+ /* Ensure we requeue to the expected futex. */
+ if (!match_futex(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
/*
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
* the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
continue;
}
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
*
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
*
* If 2, we may then block on trying to take the rt_mutex and return via:
* 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* 7) timeout
* 8) other lock acquisition failure
*
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- q.pi_state = NULL;
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
-
key2 = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
- * We've already been requeued, but we have no way to
- * restart by calling futex_lock_pi() directly. We
- * could restart the syscall, but that will look at
- * the user space value and return right away. So we
- * drop back with EWOULDBLOCK to tell user space that
- * "val" has been changed. That's the same what the
- * restart of the syscall would do in
- * futex_wait_setup().
+ * We've already been requeued, but cannot restart by calling
+ * futex_lock_pi() directly. We could restart this syscall, but
+ * it would detect that the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart and return
+ * -EWOULDBLOCK directly.
*/
ret = -EWOULDBLOCK;
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..e2f91ecc01a8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
#include <asm/uaccess.h>
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
- struct timespec now;
-
- ktime_get_ts(&now);
-
- return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
- struct timespec now;
-
- getnstimeofday(&now);
-
- return timespec_to_ktime(now);
-}
-
-EXPORT_SYMBOL_GPL(ktime_get_real);
-
/*
* The timer bases:
*
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
}
};
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
- struct timespec tomono;
- unsigned long seq;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- getnstimeofday(ts);
- tomono = wall_to_monotonic;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
/*
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
@@ -1154,7 +1098,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &cpu_base->clock_base[clock_id];
- INIT_LIST_HEAD(&timer->cb_entry);
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..5765aad94998 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
}
EXPORT_SYMBOL(set_irq_chip_data);
+/**
+ * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
+ *
+ * @irq: Interrupt number
+ * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
+ *
+ * The IRQ_NESTED_THREAD flag indicates that on
+ * request_threaded_irq() no separate interrupt thread should be
+ * created for the irq as the handler are called nested in the
+ * context of a demultiplexing interrupt handler thread.
+ */
+void set_irq_nested_thread(unsigned int irq, int nest)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (nest)
+ desc->status |= IRQ_NESTED_THREAD;
+ else
+ desc->status &= ~IRQ_NESTED_THREAD;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(set_irq_nested_thread);
+
/*
* default enable function
*/
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
}
}
+/*
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
+ *
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling
+ * threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ might_sleep();
+
+ spin_lock_irq(&desc->lock);
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock_irq(&desc->lock);
+
+ action_ret = action->thread_fn(action->irq, action->dev_id);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock_irq(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+
+out_unlock:
+ spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
- if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+
+ if (unlikely(desc->status & IRQ_ONESHOT))
+ desc->status |= IRQ_MASKED;
+ else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
desc->chip->unmask(irq);
out_unlock:
spin_unlock(&desc->lock);
@@ -478,8 +548,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
- if (desc->chip->ack)
- desc->chip->ack(irq);
+ if (unlikely(desc->status & IRQ_ONESHOT)) {
+ desc->status |= IRQ_MASKED;
+ mask_ack_irq(desc, irq);
+ } else {
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
+ }
/* Mark the IRQ currently in progress.*/
desc->status |= IRQ_INPROGRESS;
@@ -572,6 +647,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip = &dummy_irq_chip;
}
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
/* Uninstall? */
@@ -591,6 +667,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip->startup(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL_GPL(__set_irq_handler);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_lock))
+ desc->chip->bus_lock(irq);
+}
+
+static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_sync_unlock))
+ desc->chip->bus_sync_unlock(irq);
+}
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d222515a5a06..a19cb293731e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
__disable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
* matches the last disable, processing of interrupts on this
* IRQ line is re-enabled.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context only when
+ * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(enable_irq);
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
+/*
+ * Default primary interrupt handler for threaded interrupts. Is
+ * assigned as primary handler when request_threaded_irq is called
+ * with handler == NULL. Useful for oneshot interrupts.
+ */
+static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary handler for nested threaded interrupts. Should never be
+ * called.
+ */
+static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Primary handler called for nested irq %d\n", irq);
+ return IRQ_NONE;
+}
+
static int irq_wait_for_interrupt(struct irqaction *action)
{
while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return -1;
}
+/*
+ * Oneshot interrupts keep the irq line masked until the threaded
+ * handler finished. unmask if the interrupt has not been disabled and
+ * is marked MASKED.
+ */
+static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+{
+ chip_bus_lock(irq, desc);
+ spin_lock_irq(&desc->lock);
+ if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
+ desc->status &= ~IRQ_MASKED;
+ desc->chip->unmask(irq);
+ }
+ spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(irq, desc);
+}
+
#ifdef CONFIG_SMP
/*
* Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
- int wake;
+ int wake, oneshot = desc->status & IRQ_ONESHOT;
sched_setscheduler(current, SCHED_FIFO, &param);
current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
spin_unlock_irq(&desc->lock);
action->thread_fn(action->irq, action->dev_id);
+
+ if (oneshot)
+ irq_finalize_oneshot(action->irq, desc);
}
wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
struct irqaction *old, **old_ptr;
const char *old_name = NULL;
unsigned long flags;
- int shared = 0;
+ int nested, shared = 0;
int ret;
if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
+ /* Oneshot interrupts are not allowed with shared */
+ if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
+ return -EINVAL;
+
+ /*
+ * Check whether the interrupt nests into another interrupt
+ * thread.
+ */
+ nested = desc->status & IRQ_NESTED_THREAD;
+ if (nested) {
+ if (!new->thread_fn)
+ return -EINVAL;
+ /*
+ * Replace the primary handler which was provided from
+ * the driver for non nested interrupt handling by the
+ * dummy function which warns when called.
+ */
+ new->handler = irq_nested_primary_handler;
+ }
+
/*
- * Threaded handler ?
+ * Create a handler thread when a thread function is supplied
+ * and the interrupt does not nest into another interrupt
+ * thread.
*/
- if (new->thread_fn) {
+ if (new->thread_fn && !nested) {
struct task_struct *t;
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -607,7 +674,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
- wake_up_process(t);
}
/*
@@ -663,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
desc->status |= IRQ_PER_CPU;
#endif
- desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+ desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+ if (new->flags & IRQF_ONESHOT)
+ desc->status |= IRQ_ONESHOT;
+
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
desc->status &= ~IRQ_DISABLED;
@@ -690,6 +759,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
(int)(new->flags & IRQF_TRIGGER_MASK));
}
+ new->irq = irq;
*old_ptr = new;
/* Reset broken irq detection when installing new handler */
@@ -707,7 +777,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
spin_unlock_irqrestore(&desc->lock, flags);
- new->irq = irq;
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
@@ -869,7 +938,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
*/
void free_irq(unsigned int irq, void *dev_id)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return;
+
+ chip_bus_lock(irq, desc);
kfree(__free_irq(irq, dev_id));
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(free_irq);
@@ -878,6 +954,8 @@ EXPORT_SYMBOL(free_irq);
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
* Primary handler for threaded interrupts
+ * If NULL and thread_fn != NULL the default
+ * primary handler is installed
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
@@ -957,8 +1035,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (desc->status & IRQ_NOREQUEST)
return -EINVAL;
- if (!handler)
- return -EINVAL;
+
+ if (!handler) {
+ if (!thread_fn)
+ return -EINVAL;
+ handler = irq_default_primary_handler;
+ }
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
@@ -970,12 +1052,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
+ chip_bus_lock(irq, desc);
retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(irq, desc);
+
if (retval)
kfree(action);
#ifdef CONFIG_DEBUG_SHIRQ
- if (irqflags & IRQF_SHARED) {
+ if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
* to happen immediately, so let's make sure....
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
- if (!desc->chip || !desc->chip->retrigger ||
- !desc->chip->retrigger(irq)) {
+ if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
-MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..8078a32d3b10 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -41,10 +41,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
return ktime_to_timeval(rem);
}
+static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+ struct itimerval *const value)
+{
+ cputime_t cval, cinterval;
+ struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+ spin_lock_irq(&tsk->sighand->siglock);
+
+ cval = it->expires;
+ cinterval = it->incr;
+ if (!cputime_eq(cval, cputime_zero)) {
+ struct task_cputime cputime;
+ cputime_t t;
+
+ thread_group_cputimer(tsk, &cputime);
+ if (clock_id == CPUCLOCK_PROF)
+ t = cputime_add(cputime.utime, cputime.stime);
+ else
+ /* CPUCLOCK_VIRT */
+ t = cputime.utime;
+
+ if (cputime_le(cval, t))
+ /* about to fire */
+ cval = cputime_one_jiffy;
+ else
+ cval = cputime_sub(cval, t);
+ }
+
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+}
+
int do_getitimer(int which, struct itimerval *value)
{
struct task_struct *tsk = current;
- cputime_t cinterval, cval;
switch (which) {
case ITIMER_REAL:
@@ -55,44 +88,10 @@ int do_getitimer(int which, struct itimerval *value)
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
- spin_lock_irq(&tsk->sighand->siglock);
- cval = tsk->signal->it_virt_expires;
- cinterval = tsk->signal->it_virt_incr;
- if (!cputime_eq(cval, cputime_zero)) {
- struct task_cputime cputime;
- cputime_t utime;
-
- thread_group_cputimer(tsk, &cputime);
- utime = cputime.utime;
- if (cputime_le(cval, utime)) { /* about to fire */
- cval = jiffies_to_cputime(1);
- } else {
- cval = cputime_sub(cval, utime);
- }
- }
- spin_unlock_irq(&tsk->sighand->siglock);
- cputime_to_timeval(cval, &value->it_value);
- cputime_to_timeval(cinterval, &value->it_interval);
+ get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
break;
case ITIMER_PROF:
- spin_lock_irq(&tsk->sighand->siglock);
- cval = tsk->signal->it_prof_expires;
- cinterval = tsk->signal->it_prof_incr;
- if (!cputime_eq(cval, cputime_zero)) {
- struct task_cputime times;
- cputime_t ptime;
-
- thread_group_cputimer(tsk, &times);
- ptime = cputime_add(times.utime, times.stime);
- if (cputime_le(cval, ptime)) { /* about to fire */
- cval = jiffies_to_cputime(1);
- } else {
- cval = cputime_sub(cval, ptime);
- }
- }
- spin_unlock_irq(&tsk->sighand->siglock);
- cputime_to_timeval(cval, &value->it_value);
- cputime_to_timeval(cinterval, &value->it_interval);
+ get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
break;
default:
return(-EINVAL);
@@ -128,6 +127,54 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
+{
+ struct timespec ts;
+ s64 cpu_ns;
+
+ cputime_to_timespec(ct, &ts);
+ cpu_ns = timespec_to_ns(&ts);
+
+ return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
+}
+
+static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+ const struct itimerval *const value,
+ struct itimerval *const ovalue)
+{
+ cputime_t cval, nval, cinterval, ninterval;
+ s64 ns_ninterval, ns_nval;
+ struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+ nval = timeval_to_cputime(&value->it_value);
+ ns_nval = timeval_to_ns(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ ns_ninterval = timeval_to_ns(&value->it_interval);
+
+ it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+ it->error = cputime_sub_ns(nval, ns_nval);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+
+ cval = it->expires;
+ cinterval = it->incr;
+ if (!cputime_eq(cval, cputime_zero) ||
+ !cputime_eq(nval, cputime_zero)) {
+ if (cputime_gt(nval, cputime_zero))
+ nval = cputime_add(nval, cputime_one_jiffy);
+ set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+ }
+ it->expires = nval;
+ it->incr = ninterval;
+
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+}
+
/*
* Returns true if the timeval is in canonical form
*/
@@ -139,7 +186,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
struct task_struct *tsk = current;
struct hrtimer *timer;
ktime_t expires;
- cputime_t cval, cinterval, nval, ninterval;
/*
* Validate the timevals in value.
@@ -174,48 +220,10 @@ again:
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
- nval = timeval_to_cputime(&value->it_value);
- ninterval = timeval_to_cputime(&value->it_interval);
- spin_lock_irq(&tsk->sighand->siglock);
- cval = tsk->signal->it_virt_expires;
- cinterval = tsk->signal->it_virt_incr;
- if (!cputime_eq(cval, cputime_zero) ||
- !cputime_eq(nval, cputime_zero)) {
- if (cputime_gt(nval, cputime_zero))
- nval = cputime_add(nval,
- jiffies_to_cputime(1));
- set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
- &nval, &cval);
- }
- tsk->signal->it_virt_expires = nval;
- tsk->signal->it_virt_incr = ninterval;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (ovalue) {
- cputime_to_timeval(cval, &ovalue->it_value);
- cputime_to_timeval(cinterval, &ovalue->it_interval);
- }
+ set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
break;
case ITIMER_PROF:
- nval = timeval_to_cputime(&value->it_value);
- ninterval = timeval_to_cputime(&value->it_interval);
- spin_lock_irq(&tsk->sighand->siglock);
- cval = tsk->signal->it_prof_expires;
- cinterval = tsk->signal->it_prof_incr;
- if (!cputime_eq(cval, cputime_zero) ||
- !cputime_eq(nval, cputime_zero)) {
- if (cputime_gt(nval, cputime_zero))
- nval = cputime_add(nval,
- jiffies_to_cputime(1));
- set_process_cpu_timer(tsk, CPUCLOCK_PROF,
- &nval, &cval);
- }
- tsk->signal->it_prof_expires = nval;
- tsk->signal->it_prof_incr = ninterval;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (ovalue) {
- cputime_to_timeval(cval, &ovalue->it_value);
- cputime_to_timeval(cinterval, &ovalue->it_interval);
- }
+ set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
break;
default:
return -EINVAL;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5a7ae57f983f..94abc21c8371 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
#include <linux/suspend.h>
#include <asm/uaccess.h>
+#include <trace/events/module.h>
+
extern int max_threads;
static struct workqueue_struct *khelper_wq;
@@ -112,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
return -ENOMEM;
}
+ trace_module_request(module_name, wait, _RET_IP_);
+
ret = call_usermodehelper(modprobe_path, argv, envp,
wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
struct kprobe_insn_page {
- struct hlist_node hlist;
+ struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
char slot_used[INSNS_PER_PAGE];
int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
};
static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
-static struct hlist_head kprobe_insn_pages;
+static LIST_HEAD(kprobe_insn_pages);
static int kprobe_garbage_slots;
static int collect_garbage_slots(void);
@@ -152,10 +152,9 @@ loop_end:
static kprobe_opcode_t __kprobes *__get_insn_slot(void)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
retry:
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
kfree(kip);
return NULL;
}
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+ INIT_LIST_HEAD(&kip->list);
+ list_add(&kip->list, &kprobe_insn_pages);
memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
* so as not to have to set it up again the
* next time somebody inserts a probe.
*/
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
+ if (!list_is_singular(&kprobe_insn_pages)) {
+ list_del(&kip->list);
module_free(NULL, kip->insns);
kfree(kip);
}
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
static int __kprobes collect_garbage_slots(void)
{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos, *next;
+ struct kprobe_insn_page *kip, *next;
/* Ensure no-one is preepmted on the garbages */
if (check_safety())
return -EAGAIN;
- hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
+ list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
int i;
if (kip->ngarbage == 0)
continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
mutex_lock(&kprobe_insn_mutex);
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
if (dirty) {
kip->slot_used[i] = SLOT_DIRTY;
kip->ngarbage++;
- } else {
+ } else
collect_one_slot(kip, i);
- }
break;
}
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..0843584aed5e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitops.h>
#include <asm/sections.h>
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
save_stack_trace(trace);
+ /*
+ * Some daft arches put -1 at the end to indicate its a full trace.
+ *
+ * <rant> this is buggy anyway, since it takes a whole extra entry so a
+ * complete trace that maxes out the entries provided will be reported
+ * as incomplete, friggin useless </rant>
+ */
+ if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+ trace->nr_entries--;
+
trace->max_entries = trace->nr_entries;
nr_stack_trace_entries += trace->nr_entries;
- if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -390,19 +401,6 @@ unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
unsigned int max_recursion_depth;
-static unsigned int lockdep_dependency_gen_id;
-
-static bool lockdep_dependency_visit(struct lock_class *source,
- unsigned int depth)
-{
- if (!depth)
- lockdep_dependency_gen_id++;
- if (source->dep_gen_id == lockdep_dependency_gen_id)
- return true;
- source->dep_gen_id = lockdep_dependency_gen_id;
- return false;
-}
-
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* We cannot printk in early bootup code. Not even early_printk()
@@ -551,58 +549,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
}
}
-static void print_lock_class_header(struct lock_class *class, int depth)
-{
- int bit;
-
- printk("%*s->", depth, "");
- print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
-
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
- if (class->usage_mask & (1 << bit)) {
- int len = depth;
-
- len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
- print_stack_trace(class->usage_traces + bit, len);
- }
- }
- printk("%*s }\n", depth, "");
-
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
-}
-
-/*
- * printk all lock dependencies starting at <entry>:
- */
-static void __used
-print_lock_dependencies(struct lock_class *class, int depth)
-{
- struct lock_list *entry;
-
- if (lockdep_dependency_visit(class, depth))
- return;
-
- if (DEBUG_LOCKS_WARN_ON(depth >= 20))
- return;
-
- print_lock_class_header(class, depth);
-
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (DEBUG_LOCKS_WARN_ON(!entry->class))
- return;
-
- print_lock_dependencies(entry->class, depth + 1);
-
- printk("%*s ... acquired at:\n",depth,"");
- print_stack_trace(&entry->trace, 2);
- printk("\n");
- }
-}
-
static void print_kernel_version(void)
{
printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +844,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
}
/*
+ * For good efficiency of modular, we use power of 2
+ */
+#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/*
+ * The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ */
+struct circular_queue {
+ unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ unsigned int front, rear;
+};
+
+static struct circular_queue lock_cq;
+
+unsigned int max_bfs_queue_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+ cq->front = cq->rear = 0;
+ lockdep_dependency_gen_id++;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+ return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+ return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+ if (__cq_full(cq))
+ return -1;
+
+ cq->element[cq->rear] = elem;
+ cq->rear = (cq->rear + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+ if (__cq_empty(cq))
+ return -1;
+
+ *elem = cq->element[cq->front];
+ cq->front = (cq->front + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
+{
+ return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+ struct lock_list *parent)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ lock->parent = parent;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+ return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+ int depth = 0;
+ struct lock_list *parent;
+
+ while ((parent = get_lock_parent(child))) {
+ child = parent;
+ depth++;
+ }
+ return depth;
+}
+
+static int __bfs(struct lock_list *source_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int forward)
+{
+ struct lock_list *entry;
+ struct list_head *head;
+ struct circular_queue *cq = &lock_cq;
+ int ret = 1;
+
+ if (match(source_entry, data)) {
+ *target_entry = source_entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (forward)
+ head = &source_entry->class->locks_after;
+ else
+ head = &source_entry->class->locks_before;
+
+ if (list_empty(head))
+ goto exit;
+
+ __cq_init(cq);
+ __cq_enqueue(cq, (unsigned long)source_entry);
+
+ while (!__cq_empty(cq)) {
+ struct lock_list *lock;
+
+ __cq_dequeue(cq, (unsigned long *)&lock);
+
+ if (!lock->class) {
+ ret = -2;
+ goto exit;
+ }
+
+ if (forward)
+ head = &lock->class->locks_after;
+ else
+ head = &lock->class->locks_before;
+
+ list_for_each_entry(entry, head, entry) {
+ if (!lock_accessed(entry)) {
+ unsigned int cq_depth;
+ mark_lock_accessed(entry, lock);
+ if (match(entry, data)) {
+ *target_entry = entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (__cq_enqueue(cq, (unsigned long)entry)) {
+ ret = -1;
+ goto exit;
+ }
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
+ }
+ }
+ }
+exit:
+ return ret;
+}
+
+static inline int __bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 1);
+
+}
+
+static inline int __bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 0);
+
+}
+
+/*
* Recursive, forwards-direction lock-dependency checking, used for
* both noncyclic checking and for hardirq-unsafe/softirq-unsafe
* checking.
- *
- * (to keep the stackframe of the recursive functions small we
- * use these global variables, and we also mark various helper
- * functions as noinline.)
*/
-static struct held_lock *check_source, *check_target;
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
static noinline int
-print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
return 0;
@@ -930,11 +1057,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
* header first:
*/
static noinline int
-print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (debug_locks_silent)
return 0;
printk("\n=======================================================\n");
@@ -943,9 +1072,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
printk( "-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
- print_lock(check_source);
+ print_lock(check_src);
printk("\nbut task is already holding lock:\n");
- print_lock(check_target);
+ print_lock(check_tgt);
printk("\nwhich lock already depends on the new lock.\n\n");
printk("\nthe existing dependency chain (in reverse order) is:\n");
@@ -954,19 +1083,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
return 0;
}
-static noinline int print_circular_bug_tail(void)
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+ return entry->class == data;
+}
+
+static noinline int print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- struct lock_list this;
+ struct lock_list *parent;
+ int depth;
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- this.class = hlock_class(check_source);
- if (!save_trace(&this.trace))
+ if (!save_trace(&this->trace))
return 0;
- print_circular_bug_entry(&this, 0);
+ depth = get_lock_depth(target);
+
+ print_circular_bug_header(target, depth, check_src, check_tgt);
+
+ parent = get_lock_parent(target);
+
+ while (parent) {
+ print_circular_bug_entry(parent, --depth);
+ parent = get_lock_parent(parent);
+ }
printk("\nother info that might help us debug this:\n\n");
lockdep_print_held_locks(curr);
@@ -977,73 +1123,69 @@ static noinline int print_circular_bug_tail(void)
return 0;
}
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
+static noinline int print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
return 0;
- WARN_ON(1);
+ WARN(1, "lockdep bfs error:%d\n", ret);
return 0;
}
-unsigned long __lockdep_count_forward_deps(struct lock_class *class,
- unsigned int depth)
+static int noop_count(struct lock_list *entry, void *data)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ (*(unsigned long *)data)++;
+ return 0;
+}
- if (lockdep_dependency_visit(class, depth))
- return 0;
+unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_after, entry)
- ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+ __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
-
unsigned long lockdep_count_forward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_forward_deps(class, 0);
+ ret = __lockdep_count_forward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
return ret;
}
-unsigned long __lockdep_count_backward_deps(struct lock_class *class,
- unsigned int depth)
+unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- if (lockdep_dependency_visit(class, depth))
- return 0;
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_before, entry)
- ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+ __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
unsigned long lockdep_count_backward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_backward_deps(class, 0);
+ ret = __lockdep_count_backward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
@@ -1055,29 +1197,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
* lead to <target>. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_class *source, unsigned int depth)
+check_noncircular(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
+ int result;
- if (lockdep_dependency_visit(source, depth))
- return 1;
+ debug_atomic_inc(&nr_cyclic_checks);
- debug_atomic_inc(&nr_cyclic_check_recursions);
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == hlock_class(check_target))
- return print_circular_bug_header(entry, depth+1);
- debug_atomic_inc(&nr_cyclic_checks);
- if (!check_noncircular(entry->class, depth+1))
- return print_circular_bug_entry(entry, depth+1);
- }
- return 1;
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
}
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1215,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
*/
-static enum lock_usage_bit find_usage_bit;
-static struct lock_class *forwards_match, *backwards_match;
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+ return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <forwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <forwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_forwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
-
- if (lockdep_dependency_visit(source, depth))
- return 1;
-
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
+ int result;
debug_atomic_inc(&nr_find_usage_forwards_checks);
- if (source->usage_mask & (1 << find_usage_bit)) {
- forwards_match = source;
- return 2;
- }
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- debug_atomic_inc(&nr_find_usage_forwards_recursions);
- ret = find_usage_forwards(entry->class, depth+1);
- if (ret == 2 || ret == 0)
- return ret;
- }
- return 1;
+ result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
}
/*
* Find a node in the backwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <backwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <backwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_backwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
+ int result;
- if (lockdep_dependency_visit(source, depth))
- return 1;
+ debug_atomic_inc(&nr_find_usage_backwards_checks);
- if (!__raw_spin_is_locked(&lockdep_lock))
- return DEBUG_LOCKS_WARN_ON(1);
+ result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
+ return result;
+}
- debug_atomic_inc(&nr_find_usage_backwards_checks);
- if (source->usage_mask & (1 << find_usage_bit)) {
- backwards_match = source;
- return 2;
- }
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
- if (!source && debug_locks_off_graph_unlock()) {
- WARN_ON(1);
- return 0;
- }
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_before, entry) {
- debug_atomic_inc(&nr_find_usage_backwards_recursions);
- ret = find_usage_backwards(entry->class, depth+1);
- if (ret == 2 || ret == 0)
- return ret;
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
}
- return 1;
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ printk("%*s ... acquired at:\n", depth, "");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad BFS generated tree\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+
+ return;
}
static int
print_bad_irq_dependency(struct task_struct *curr,
+ struct lock_list *prev_root,
+ struct lock_list *next_root,
+ struct lock_list *backwards_entry,
+ struct lock_list *forwards_entry,
struct held_lock *prev,
struct held_lock *next,
enum lock_usage_bit bit1,
@@ -1215,26 +1362,32 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_match);
+ print_lock_name(backwards_entry->class);
printk("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_match->usage_traces + bit1, 1);
+ print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
printk("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_match);
+ print_lock_name(forwards_entry->class);
printk("\n... which became %s-irq-unsafe at:\n", irqclass);
printk("...");
- print_stack_trace(forwards_match->usage_traces + bit2, 1);
+ print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
printk("\nother info that might help us debug this:\n\n");
lockdep_print_held_locks(curr);
- printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
- print_lock_dependencies(backwards_match, 0);
+ printk("\nthe dependencies between %s-irq-safe lock", irqclass);
+ printk(" and the holding lock:\n");
+ if (!save_trace(&prev_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(backwards_entry, prev_root);
- printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
- print_lock_dependencies(forwards_match, 0);
+ printk("\nthe dependencies between the lock to be acquired");
+ printk(" and %s-irq-unsafe lock:\n", irqclass);
+ if (!save_trace(&next_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(forwards_entry, next_root);
printk("\nstack backtrace:\n");
dump_stack();
@@ -1248,19 +1401,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
enum lock_usage_bit bit_forwards, const char *irqclass)
{
int ret;
+ struct lock_list this, that;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *uninitialized_var(target_entry1);
- find_usage_bit = bit_backwards;
- /* fills in <backwards_match> */
- ret = find_usage_backwards(hlock_class(prev), 0);
- if (!ret || ret == 1)
+ this.parent = NULL;
+
+ this.class = hlock_class(prev);
+ ret = find_usage_backwards(&this, bit_backwards, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- find_usage_bit = bit_forwards;
- ret = find_usage_forwards(hlock_class(next), 0);
- if (!ret || ret == 1)
+ that.parent = NULL;
+ that.class = hlock_class(next);
+ ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- /* ret == 2 */
- return print_bad_irq_dependency(curr, prev, next,
+
+ return print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
bit_backwards, bit_forwards, irqclass);
}
@@ -1472,6 +1636,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
{
struct lock_list *entry;
int ret;
+ struct lock_list this;
+ struct lock_list *uninitialized_var(target_entry);
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1648,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* We are using global variables to control the recursion, to
* keep the stackframe size of the recursive functions low:
*/
- check_source = next;
- check_target = prev;
- if (!(check_noncircular(hlock_class(next), 0)))
- return print_circular_bug_tail();
+ this.class = hlock_class(next);
+ this.parent = NULL;
+ ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+ if (unlikely(!ret))
+ return print_circular_bug(&this, target_entry, next, prev);
+ else if (unlikely(ret < 0))
+ return print_bfs_bug(ret);
if (!check_prev_add_irq(curr, prev, next))
return 0;
@@ -1884,7 +2053,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* print irq inversion bug:
*/
static int
-print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+print_irq_inversion_bug(struct task_struct *curr,
+ struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
const char *irqclass)
{
@@ -1902,17 +2072,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other);
+ print_lock_name(other->class);
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
printk("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nthe first lock's dependencies:\n");
- print_lock_dependencies(hlock_class(this), 0);
-
- printk("\nthe second lock's dependencies:\n");
- print_lock_dependencies(other, 0);
+ printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ if (!save_trace(&root->trace))
+ return 0;
+ print_shortest_lock_dependencies(other, root);
printk("\nstack backtrace:\n");
dump_stack();
@@ -1929,14 +2098,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit bit, const char *irqclass)
{
int ret;
-
- find_usage_bit = bit;
- /* fills in <forwards_match> */
- ret = find_usage_forwards(hlock_class(this), 0);
- if (!ret || ret == 1)
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_forwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
}
/*
@@ -1948,14 +2122,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit bit, const char *irqclass)
{
int ret;
-
- find_usage_bit = bit;
- /* fills in <backwards_match> */
- ret = find_usage_backwards(hlock_class(this), 0);
- if (!ret || ret == 1)
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_backwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
}
void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2709,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
- struct lockdep_map *nest_lock, unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip,
+ int references)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
struct held_lock *hlock;
unsigned int depth, id;
int chain_head = 0;
+ int class_idx;
u64 chain_key;
if (!prove_locking)
@@ -2584,10 +2765,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
+ class_idx = class - lock_classes + 1;
+
+ if (depth) {
+ hlock = curr->held_locks + depth - 1;
+ if (hlock->class_idx == class_idx && nest_lock) {
+ if (hlock->references)
+ hlock->references++;
+ else
+ hlock->references = 2;
+
+ return 1;
+ }
+ }
+
hlock = curr->held_locks + depth;
if (DEBUG_LOCKS_WARN_ON(!class))
return 0;
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class_idx;
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
@@ -2595,6 +2790,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->read = read;
hlock->check = check;
hlock->hardirqs_off = !!hardirqs_off;
+ hlock->references = references;
#ifdef CONFIG_LOCK_STAT
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2899,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 1;
}
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+ if (hlock->instance == lock)
+ return 1;
+
+ if (hlock->references) {
+ struct lock_class *class = lock->class_cache;
+
+ if (!class)
+ class = look_up_lock_class(lock, 0);
+
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+
+ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+ return 0;
+
+ if (hlock->class_idx == class - lock_classes + 1)
+ return 1;
+ }
+
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2946,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -2745,7 +2965,8 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
return 0;
}
@@ -2784,20 +3005,34 @@ lock_release_non_nested(struct task_struct *curr,
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
return print_unlock_inbalance_bug(curr, lock, ip);
found_it:
- lock_release_holdtime(hlock);
+ if (hlock->instance == lock)
+ lock_release_holdtime(hlock);
+
+ if (hlock->references) {
+ hlock->references--;
+ if (hlock->references) {
+ /*
+ * We had, and after removing one, still have
+ * references, the current lock stack is still
+ * valid. We're done!
+ */
+ return 1;
+ }
+ }
/*
* We have the right lock to unlock, 'hlock' points to it.
* Now we remove it from the stack, and add back the other
* entries (if any), recalculating the hash along the way:
*/
+
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -2806,7 +3041,8 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
return 0;
}
@@ -2836,7 +3072,7 @@ static int lock_release_nested(struct task_struct *curr,
/*
* Is the unlock non-nested:
*/
- if (hlock->instance != lock)
+ if (hlock->instance != lock || hlock->references)
return lock_release_non_nested(curr, lock, ip);
curr->lockdep_depth--;
@@ -2881,6 +3117,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
check_chain_key(curr);
}
+static int __lock_is_held(struct lockdep_map *lock)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* Check whether we follow the irq-flags state precisely:
*/
@@ -2957,7 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
current->lockdep_recursion = 1;
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip);
+ irqs_disabled_flags(flags), nest_lock, ip, 0);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -2982,6 +3233,26 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
+int lock_is_held(struct lockdep_map *lock)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (unlikely(current->lockdep_recursion))
+ return ret;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ ret = __lock_is_held(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
current->lockdep_reclaim_gfp = gfp_mask;
@@ -3041,7 +3312,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -3049,6 +3320,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
return;
found_it:
+ if (hlock->instance != lock)
+ return;
+
hlock->waittime_stamp = sched_clock();
contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3362,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -3096,6 +3370,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
return;
found_it:
+ if (hlock->instance != lock)
+ return;
+
cpu = smp_processor_id();
if (hlock->waittime_stamp) {
now = sched_clock();
@@ -3326,7 +3603,12 @@ void __init lockdep_info(void)
sizeof(struct list_head) * CLASSHASH_SIZE +
sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+ sizeof(struct list_head) * CHAINHASH_SIZE
+#ifdef CONFIG_PROVE_LOCKING
+ + sizeof(struct circular_queue)
+#endif
+ ) / 1024
+ );
printk(" per task-struct memory footprint: %lu bytes\n",
sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+extern unsigned int max_bfs_queue_depth;
+
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..37b6ddc87f53 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct lock_class *class;
-
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- class = m->private;
- else {
- class = v;
-
- if (class->lock_entry.next != &all_lock_classes)
- class = list_entry(class->lock_entry.next,
- struct lock_class, lock_entry);
- else
- class = NULL;
- }
-
- return class;
+ return seq_list_next(v, &all_lock_classes, pos);
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- struct lock_class *class;
- loff_t i = 0;
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
- if (++i == *pos)
- return class;
- }
- return NULL;
+ return seq_list_start_head(&all_lock_classes, *pos);
}
static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = v;
+ struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
- if (v == SEQ_START_TOKEN) {
+ if (v == &all_lock_classes) {
seq_printf(m, "all lock classes:\n");
return 0;
}
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
static int lockdep_open(struct inode *inode, struct file *file)
{
- int res = seq_open(file, &lockdep_ops);
- if (!res) {
- struct seq_file *m = file->private_data;
-
- if (!list_empty(&all_lock_classes))
- m->private = list_entry(all_lock_classes.next,
- struct lock_class, lock_entry);
- else
- m->private = NULL;
- }
- return res;
+ return seq_open(file, &lockdep_ops);
}
static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
};
#ifdef CONFIG_PROVE_LOCKING
-static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct lock_chain *chain;
-
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- chain = m->private;
- else {
- chain = v;
-
- if (*pos < nr_lock_chains)
- chain = lock_chains + *pos;
- else
- chain = NULL;
- }
-
- return chain;
-}
-
static void *lc_start(struct seq_file *m, loff_t *pos)
{
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos < nr_lock_chains)
- return lock_chains + *pos;
+ if (*pos - 1 < nr_lock_chains)
+ return lock_chains + (*pos - 1);
return NULL;
}
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return lc_start(m, pos);
+}
+
static void lc_stop(struct seq_file *m, void *v)
{
}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
static int lockdep_chains_open(struct inode *inode, struct file *file)
{
- int res = seq_open(file, &lockdep_chains_ops);
- if (!res) {
- struct seq_file *m = file->private_data;
-
- if (nr_lock_chains)
- m->private = lock_chains;
- else
- m->private = NULL;
- }
- return res;
+ return seq_open(file, &lockdep_chains_ops);
}
static const struct file_operations proc_lockdep_chains_operations = {
@@ -411,6 +352,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
max_lockdep_depth);
seq_printf(m, " max recursion depth: %11u\n",
max_recursion_depth);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " max bfs queue depth: %11u\n",
+ max_bfs_queue_depth);
+#endif
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -438,7 +383,6 @@ struct lock_stat_data {
};
struct lock_stat_seq {
- struct lock_stat_data *iter;
struct lock_stat_data *iter_end;
struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
};
@@ -626,34 +570,22 @@ static void seq_header(struct seq_file *m)
static void *ls_start(struct seq_file *m, loff_t *pos)
{
struct lock_stat_seq *data = m->private;
+ struct lock_stat_data *iter;
if (*pos == 0)
return SEQ_START_TOKEN;
- data->iter = data->stats + *pos;
- if (data->iter >= data->iter_end)
- data->iter = NULL;
+ iter = data->stats + (*pos - 1);
+ if (iter >= data->iter_end)
+ iter = NULL;
- return data->iter;
+ return iter;
}
static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct lock_stat_seq *data = m->private;
-
(*pos)++;
-
- if (v == SEQ_START_TOKEN)
- data->iter = data->stats;
- else {
- data->iter = v;
- data->iter++;
- }
-
- if (data->iter == data->iter_end)
- data->iter = NULL;
-
- return data->iter;
+ return ls_start(m, pos);
}
static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +623,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
- data->iter = iter;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
iter->class = class;
iter->stats = lock_stats(class);
@@ -699,7 +630,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
}
data->iter_end = iter;
- sort(data->stats, data->iter_end - data->iter,
+ sort(data->stats, data->iter_end - data->stats,
sizeof(struct lock_stat_data),
lock_stat_cmp, NULL);
@@ -734,7 +665,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
vfree(seq->private);
- seq->private = NULL;
return seq_release(inode, file);
}
diff --git a/kernel/module.c b/kernel/module.c
index fd1411403558..b1821438694e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
#include <linux/percpu.h>
#include <linux/kmemleak.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
#if 0
#define DEBUGP printk
#else
@@ -940,6 +945,8 @@ void module_put(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
local_dec(__module_ref_addr(module, cpu));
+ trace_module_put(module, _RET_IP_,
+ local_read(__module_ref_addr(module, cpu)));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1491,6 +1498,8 @@ static int __unlink_module(void *_mod)
/* Free a module, remove from lists, etc (must hold module_mutex). */
static void free_module(struct module *mod)
{
+ trace_module_free(mod);
+
/* Delete from various lists */
stop_machine(__unlink_module, mod, NULL);
remove_notes_attrs(mod);
@@ -2358,6 +2367,8 @@ static noinline struct module *load_module(void __user *umod,
/* Get rid of temporary copy */
vfree(hdr);
+ trace_module_load(mod);
+
/* Done! */
return mod;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 534e20d14d63..b8fe7397b902 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1503,10 +1503,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
*/
static void __perf_counter_read(void *info)
{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
unsigned long flags;
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
local_irq_save(flags);
if (ctx->is_active)
update_context_time(ctx);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..12161f74744e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -14,11 +14,11 @@
*/
void update_rlimit_cpu(unsigned long rlim_new)
{
- cputime_t cputime;
+ cputime_t cputime = secs_to_cputime(rlim_new);
+ struct signal_struct *const sig = current->signal;
- cputime = secs_to_cputime(rlim_new);
- if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
- cputime_gt(current->signal->it_prof_expires, cputime)) {
+ if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
+ cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
spin_lock_irq(&current->sighand->siglock);
set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +542,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
now);
}
+static inline int expires_gt(cputime_t expires, cputime_t new_exp)
+{
+ return cputime_eq(expires, cputime_zero) ||
+ cputime_gt(expires, new_exp);
+}
+
+static inline int expires_le(cputime_t expires, cputime_t new_exp)
+{
+ return !cputime_eq(expires, cputime_zero) &&
+ cputime_le(expires, new_exp);
+}
/*
* Insert the timer on the appropriate list before any timers that
* expire later. This must be called with the tasklist_lock held
@@ -586,34 +597,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ union cpu_time_count *exp = &nt->expires;
+
switch (CPUCLOCK_WHICH(timer->it_clock)) {
default:
BUG();
case CPUCLOCK_PROF:
- if (cputime_eq(p->cputime_expires.prof_exp,
- cputime_zero) ||
- cputime_gt(p->cputime_expires.prof_exp,
- nt->expires.cpu))
- p->cputime_expires.prof_exp =
- nt->expires.cpu;
+ if (expires_gt(p->cputime_expires.prof_exp,
+ exp->cpu))
+ p->cputime_expires.prof_exp = exp->cpu;
break;
case CPUCLOCK_VIRT:
- if (cputime_eq(p->cputime_expires.virt_exp,
- cputime_zero) ||
- cputime_gt(p->cputime_expires.virt_exp,
- nt->expires.cpu))
- p->cputime_expires.virt_exp =
- nt->expires.cpu;
+ if (expires_gt(p->cputime_expires.virt_exp,
+ exp->cpu))
+ p->cputime_expires.virt_exp = exp->cpu;
break;
case CPUCLOCK_SCHED:
if (p->cputime_expires.sched_exp == 0 ||
- p->cputime_expires.sched_exp >
- nt->expires.sched)
+ p->cputime_expires.sched_exp > exp->sched)
p->cputime_expires.sched_exp =
- nt->expires.sched;
+ exp->sched;
break;
}
} else {
+ struct signal_struct *const sig = p->signal;
+ union cpu_time_count *exp = &timer->it.cpu.expires;
+
/*
* For a process timer, set the cached expiration time.
*/
@@ -621,30 +630,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
default:
BUG();
case CPUCLOCK_VIRT:
- if (!cputime_eq(p->signal->it_virt_expires,
- cputime_zero) &&
- cputime_lt(p->signal->it_virt_expires,
- timer->it.cpu.expires.cpu))
+ if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
+ exp->cpu))
break;
- p->signal->cputime_expires.virt_exp =
- timer->it.cpu.expires.cpu;
+ sig->cputime_expires.virt_exp = exp->cpu;
break;
case CPUCLOCK_PROF:
- if (!cputime_eq(p->signal->it_prof_expires,
- cputime_zero) &&
- cputime_lt(p->signal->it_prof_expires,
- timer->it.cpu.expires.cpu))
+ if (expires_le(sig->it[CPUCLOCK_PROF].expires,
+ exp->cpu))
break;
- i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
+ i = sig->rlim[RLIMIT_CPU].rlim_cur;
if (i != RLIM_INFINITY &&
- i <= cputime_to_secs(timer->it.cpu.expires.cpu))
+ i <= cputime_to_secs(exp->cpu))
break;
- p->signal->cputime_expires.prof_exp =
- timer->it.cpu.expires.cpu;
+ sig->cputime_expires.prof_exp = exp->cpu;
break;
case CPUCLOCK_SCHED:
- p->signal->cputime_expires.sched_exp =
- timer->it.cpu.expires.sched;
+ sig->cputime_expires.sched_exp = exp->sched;
break;
}
}
@@ -1071,6 +1073,36 @@ static void stop_process_timers(struct task_struct *tsk)
spin_unlock_irqrestore(&cputimer->lock, flags);
}
+static u32 onecputick;
+
+static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
+ cputime_t *expires, cputime_t cur_time, int signo)
+{
+ if (cputime_eq(it->expires, cputime_zero))
+ return;
+
+ if (cputime_ge(cur_time, it->expires)) {
+ if (!cputime_eq(it->incr, cputime_zero)) {
+ it->expires = cputime_add(it->expires, it->incr);
+ it->error += it->incr_error;
+ if (it->error >= onecputick) {
+ it->expires = cputime_sub(it->expires,
+ cputime_one_jiffy);
+ it->error -= onecputick;
+ }
+ } else
+ it->expires = cputime_zero;
+
+ __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+ }
+
+ if (!cputime_eq(it->expires, cputime_zero) &&
+ (cputime_eq(*expires, cputime_zero) ||
+ cputime_lt(it->expires, *expires))) {
+ *expires = it->expires;
+ }
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1122,10 @@ static void check_process_timers(struct task_struct *tsk,
* Don't sample the current process CPU clocks if there are no timers.
*/
if (list_empty(&timers[CPUCLOCK_PROF]) &&
- cputime_eq(sig->it_prof_expires, cputime_zero) &&
+ cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
list_empty(&timers[CPUCLOCK_VIRT]) &&
- cputime_eq(sig->it_virt_expires, cputime_zero) &&
+ cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
list_empty(&timers[CPUCLOCK_SCHED])) {
stop_process_timers(tsk);
return;
@@ -1153,38 +1185,11 @@ static void check_process_timers(struct task_struct *tsk,
/*
* Check for the special case process timers.
*/
- if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
- if (cputime_ge(ptime, sig->it_prof_expires)) {
- /* ITIMER_PROF fires and reloads. */
- sig->it_prof_expires = sig->it_prof_incr;
- if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
- sig->it_prof_expires = cputime_add(
- sig->it_prof_expires, ptime);
- }
- __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
- }
- if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
- (cputime_eq(prof_expires, cputime_zero) ||
- cputime_lt(sig->it_prof_expires, prof_expires))) {
- prof_expires = sig->it_prof_expires;
- }
- }
- if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
- if (cputime_ge(utime, sig->it_virt_expires)) {
- /* ITIMER_VIRTUAL fires and reloads. */
- sig->it_virt_expires = sig->it_virt_incr;
- if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
- sig->it_virt_expires = cputime_add(
- sig->it_virt_expires, utime);
- }
- __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
- }
- if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
- (cputime_eq(virt_expires, cputime_zero) ||
- cputime_lt(sig->it_virt_expires, virt_expires))) {
- virt_expires = sig->it_virt_expires;
- }
- }
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
+ SIGPROF);
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
+ SIGVTALRM);
+
if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
cputime_t x;
@@ -1457,7 +1462,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
if (!cputime_eq(*oldval, cputime_zero)) {
if (cputime_le(*oldval, now.cpu)) {
/* Just about to fire. */
- *oldval = jiffies_to_cputime(1);
+ *oldval = cputime_one_jiffy;
} else {
*oldval = cputime_sub(*oldval, now.cpu);
}
@@ -1703,10 +1708,15 @@ static __init int init_posix_cpu_timers(void)
.nsleep = thread_cpu_nsleep,
.nsleep_restart = thread_cpu_nsleep_restart,
};
+ struct timespec ts;
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+ cputime_to_timespec(cputime_one_jiffy, &ts);
+ onecputick = ts.tv_nsec;
+ WARN_ON(ts.tv_sec != 0);
+
return 0;
}
__initcall(init_posix_cpu_timers);
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..e10d193a833a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
#include <asm/uaccess.h>
/*
+ * for_each_console() allows you to iterate on each console
+ */
+#define for_each_console(con) \
+ for (con = console_drivers; con != NULL; con = con->next)
+
+/*
* Architectures can override it:
*/
void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
+static int saved_console_loglevel = -1;
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
logged_chars = 0;
break;
case 6: /* Disable logging to console */
+ if (saved_console_loglevel == -1)
+ saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
case 7: /* Enable logging to console */
- console_loglevel = default_console_loglevel;
+ if (saved_console_loglevel != -1) {
+ console_loglevel = saved_console_loglevel;
+ saved_console_loglevel = -1;
+ }
break;
case 8: /* Set level of messages printed to console */
error = -EINVAL;
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
+ /* Implicitly re-enable logging to console */
+ saved_console_loglevel = -1;
error = 0;
break;
case 9: /* Number of chars in the log buffer */
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
{
struct console *con;
- for (con = console_drivers; con; con = con->next) {
+ for_each_console(con) {
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
(con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
{
struct console *con;
- for (con = console_drivers; con; con = con->next)
+ for_each_console(con)
if (con->flags & CON_ANYTIME)
return 1;
@@ -1082,7 +1097,7 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for (c = console_drivers; c != NULL; c = c->next)
+ for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
release_console_sem();
@@ -1097,7 +1112,7 @@ struct tty_driver *console_device(int *index)
struct tty_driver *driver = NULL;
acquire_console_sem();
- for (c = console_drivers; c != NULL; c = c->next) {
+ for_each_console(c) {
if (!c->device)
continue;
driver = c->device(c, index);
@@ -1134,25 +1149,49 @@ EXPORT_SYMBOL(console_start);
* to register the console printing procedure with printk() and to
* print any messages that were printed by the kernel before the
* console driver was initialized.
+ *
+ * This can happen pretty early during the boot process (because of
+ * early_printk) - sometimes before setup_arch() completes - be careful
+ * of what kernel features are used - they may not be initialised yet.
+ *
+ * There are two types of consoles - bootconsoles (early_printk) and
+ * "real" consoles (everything which is not a bootconsole) which are
+ * handled differently.
+ * - Any number of bootconsoles can be registered at any time.
+ * - As soon as a "real" console is registered, all bootconsoles
+ * will be unregistered automatically.
+ * - Once a "real" console is registered, any attempt to register a
+ * bootconsoles will be rejected
*/
-void register_console(struct console *console)
+void register_console(struct console *newcon)
{
int i;
unsigned long flags;
- struct console *bootconsole = NULL;
+ struct console *bcon = NULL;
- if (console_drivers) {
- if (console->flags & CON_BOOT)
- return;
- if (console_drivers->flags & CON_BOOT)
- bootconsole = console_drivers;
+ /*
+ * before we register a new CON_BOOT console, make sure we don't
+ * already have a valid console
+ */
+ if (console_drivers && newcon->flags & CON_BOOT) {
+ /* find the last or real console */
+ for_each_console(bcon) {
+ if (!(bcon->flags & CON_BOOT)) {
+ printk(KERN_INFO "Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
+ }
}
- if (preferred_console < 0 || bootconsole || !console_drivers)
+ if (console_drivers && console_drivers->flags & CON_BOOT)
+ bcon = console_drivers;
+
+ if (preferred_console < 0 || bcon || !console_drivers)
preferred_console = selected_console;
- if (console->early_setup)
- console->early_setup();
+ if (newcon->early_setup)
+ newcon->early_setup();
/*
* See if we want to use this console driver. If we
@@ -1160,13 +1199,13 @@ void register_console(struct console *console)
* that registers here.
*/
if (preferred_console < 0) {
- if (console->index < 0)
- console->index = 0;
- if (console->setup == NULL ||
- console->setup(console, NULL) == 0) {
- console->flags |= CON_ENABLED;
- if (console->device) {
- console->flags |= CON_CONSDEV;
+ if (newcon->index < 0)
+ newcon->index = 0;
+ if (newcon->setup == NULL ||
+ newcon->setup(newcon, NULL) == 0) {
+ newcon->flags |= CON_ENABLED;
+ if (newcon->device) {
+ newcon->flags |= CON_CONSDEV;
preferred_console = 0;
}
}
@@ -1178,64 +1217,62 @@ void register_console(struct console *console)
*/
for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
i++) {
- if (strcmp(console_cmdline[i].name, console->name) != 0)
+ if (strcmp(console_cmdline[i].name, newcon->name) != 0)
continue;
- if (console->index >= 0 &&
- console->index != console_cmdline[i].index)
+ if (newcon->index >= 0 &&
+ newcon->index != console_cmdline[i].index)
continue;
- if (console->index < 0)
- console->index = console_cmdline[i].index;
+ if (newcon->index < 0)
+ newcon->index = console_cmdline[i].index;
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
if (console_cmdline[i].brl_options) {
- console->flags |= CON_BRL;
- braille_register_console(console,
+ newcon->flags |= CON_BRL;
+ braille_register_console(newcon,
console_cmdline[i].index,
console_cmdline[i].options,
console_cmdline[i].brl_options);
return;
}
#endif
- if (console->setup &&
- console->setup(console, console_cmdline[i].options) != 0)
+ if (newcon->setup &&
+ newcon->setup(newcon, console_cmdline[i].options) != 0)
break;
- console->flags |= CON_ENABLED;
- console->index = console_cmdline[i].index;
+ newcon->flags |= CON_ENABLED;
+ newcon->index = console_cmdline[i].index;
if (i == selected_console) {
- console->flags |= CON_CONSDEV;
+ newcon->flags |= CON_CONSDEV;
preferred_console = selected_console;
}
break;
}
- if (!(console->flags & CON_ENABLED))
+ if (!(newcon->flags & CON_ENABLED))
return;
- if (bootconsole && (console->flags & CON_CONSDEV)) {
- printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
- bootconsole->name, bootconsole->index,
- console->name, console->index);
- unregister_console(bootconsole);
- console->flags &= ~CON_PRINTBUFFER;
- } else {
- printk(KERN_INFO "console [%s%d] enabled\n",
- console->name, console->index);
- }
+ /*
+ * If we have a bootconsole, and are switching to a real console,
+ * don't print everything out again, since when the boot console, and
+ * the real console are the same physical device, it's annoying to
+ * see the beginning boot messages twice
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ newcon->flags &= ~CON_PRINTBUFFER;
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
acquire_console_sem();
- if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
- console->next = console_drivers;
- console_drivers = console;
- if (console->next)
- console->next->flags &= ~CON_CONSDEV;
+ if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
+ newcon->next = console_drivers;
+ console_drivers = newcon;
+ if (newcon->next)
+ newcon->next->flags &= ~CON_CONSDEV;
} else {
- console->next = console_drivers->next;
- console_drivers->next = console;
+ newcon->next = console_drivers->next;
+ console_drivers->next = newcon;
}
- if (console->flags & CON_PRINTBUFFER) {
+ if (newcon->flags & CON_PRINTBUFFER) {
/*
* release_console_sem() will print out the buffered messages
* for us.
@@ -1245,6 +1282,28 @@ void register_console(struct console *console)
spin_unlock_irqrestore(&logbuf_lock, flags);
}
release_console_sem();
+
+ /*
+ * By unregistering the bootconsoles after we enable the real console
+ * we get the "console xxx enabled" message on all the consoles -
+ * boot consoles, real consoles, etc - this is to ensure that end
+ * users know there might be something in the kernel's log buffer that
+ * went to the bootconsole (that they do not see on the real console)
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+ /* we need to iterate through twice, to make sure we print
+ * everything out, before we unregister the console(s)
+ */
+ printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
+ newcon->name, newcon->index);
+ for_each_console(bcon)
+ if (bcon->flags & CON_BOOT)
+ unregister_console(bcon);
+ } else {
+ printk(KERN_INFO "%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
+ }
}
EXPORT_SYMBOL(register_console);
@@ -1287,11 +1346,13 @@ EXPORT_SYMBOL(unregister_console);
static int __init disable_boot_consoles(void)
{
- if (console_drivers != NULL) {
- if (console_drivers->flags & CON_BOOT) {
+ struct console *con;
+
+ for_each_console(con) {
+ if (con->flags & CON_BOOT) {
printk(KERN_INFO "turn off boot console %s%d\n",
- console_drivers->name, console_drivers->index);
- return unregister_console(console_drivers);
+ con->name, con->index);
+ unregister_console(con);
}
}
return 0;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2001
- *
- * Authors: Dipankar Sarma <dipankar@in.ibm.com>
- * Manfred Spraul <manfred@colorfullife.com>
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/time.h>
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
- .cur = -300,
- .completed = -300,
- .pending = -300,
- .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
- .cpumask = CPU_BITS_NONE,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .cur = -300,
- .completed = -300,
- .pending = -300,
- .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
- .cpumask = CPU_BITS_NONE,
-};
-
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
-static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-void rcu_qsctr_inc(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- rdp->passed_quiesc = 1;
-}
-
-void rcu_bh_qsctr_inc(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
- rdp->passed_quiesc = 1;
-}
-
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
-
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
- struct rcu_ctrlblk *rcp)
-{
- int cpu;
- unsigned long flags;
-
- set_need_resched();
- spin_lock_irqsave(&rcp->lock, flags);
- if (unlikely(!rcp->signaled)) {
- rcp->signaled = 1;
- /*
- * Don't send IPI to itself. With irqs disabled,
- * rdp->cpu is the current cpu.
- *
- * cpu_online_mask is updated by the _cpu_down()
- * using __stop_machine(). Since we're in irqs disabled
- * section, __stop_machine() is not exectuting, hence
- * the cpu_online_mask is stable.
- *
- * However, a cpu might have been offlined _just_ before
- * we disabled irqs while entering here.
- * And rcu subsystem might not yet have handled the CPU_DEAD
- * notification, leading to the offlined cpu's bit
- * being set in the rcp->cpumask.
- *
- * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
- * sending smp_reschedule() to an offlined CPU.
- */
- for_each_cpu_and(cpu,
- to_cpumask(rcp->cpumask), cpu_online_mask) {
- if (cpu != rdp->cpu)
- smp_send_reschedule(cpu);
- }
- }
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
- struct rcu_ctrlblk *rcp)
-{
- set_need_resched();
-}
-#endif
-
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- long batch;
-
- head->next = NULL;
- smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
-
- /*
- * Determine the batch number of this callback.
- *
- * Using ACCESS_ONCE to avoid the following error when gcc eliminates
- * local variable "batch" and emits codes like this:
- * 1) rdp->batch = rcp->cur + 1 # gets old value
- * ......
- * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
- * then [*nxttail[0], *nxttail[1]) may contain callbacks
- * that batch# = rdp->batch, see the comment of struct rcu_data.
- */
- batch = ACCESS_ONCE(rcp->cur) + 1;
-
- if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
- /* process callbacks */
- rdp->nxttail[0] = rdp->nxttail[1];
- rdp->nxttail[1] = rdp->nxttail[2];
- if (rcu_batch_after(batch - 1, rdp->batch))
- rdp->nxttail[0] = rdp->nxttail[2];
- }
-
- rdp->batch = batch;
- *rdp->nxttail[2] = head;
- rdp->nxttail[2] = &head->next;
-
- if (unlikely(++rdp->qlen > qhimark)) {
- rdp->blimit = INT_MAX;
- force_quiescent_state(rdp, &rcu_ctrlblk);
- }
-}
-
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
- rcp->gp_start = jiffies;
- rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- int cpu;
- long delta;
- unsigned long flags;
-
- /* Only let one CPU complain about others per time interval. */
-
- spin_lock_irqsave(&rcp->lock, flags);
- delta = jiffies - rcp->jiffies_stall;
- if (delta < 2 || rcp->cur != rcp->completed) {
- spin_unlock_irqrestore(&rcp->lock, flags);
- return;
- }
- rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
- spin_unlock_irqrestore(&rcp->lock, flags);
-
- /* OK, time to rat on our buddy... */
-
- printk(KERN_ERR "INFO: RCU detected CPU stalls:");
- for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
- printk(" %d", cpu);
- }
- printk(" (detected by %d, t=%ld jiffies)\n",
- smp_processor_id(), (long)(jiffies - rcp->gp_start));
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long flags;
-
- printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
- smp_processor_id(), jiffies,
- jiffies - rcp->gp_start);
- dump_stack();
- spin_lock_irqsave(&rcp->lock, flags);
- if ((long)(jiffies - rcp->jiffies_stall) >= 0)
- rcp->jiffies_stall =
- jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
- spin_unlock_irqrestore(&rcp->lock, flags);
- set_need_resched(); /* kick ourselves to get things going. */
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- long delta;
-
- delta = jiffies - rcp->jiffies_stall;
- if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
- delta >= 0) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(rcp);
-
- } else if (rcp->cur != rcp->completed && delta >= 2) {
-
- /* They had two seconds to dump stack, so complain. */
- print_other_cpu_stall(rcp);
- }
-}
-
-#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
-
- head->func = func;
- local_irq_save(flags);
- __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
-
- head->func = func;
- local_irq_save(flags);
- __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-/*
- * Return the number of RCU batches processed thus far. Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
- return rcu_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
- * Return the number of RCU batches processed thus far. Useful
- * for debug and statistics.
- */
-long rcu_batches_completed_bh(void)
-{
- return rcu_bh_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
-/* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
-{
- raise_softirq(RCU_SOFTIRQ);
-}
-
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_head *next, *list;
- int count = 0;
-
- list = rdp->donelist;
- while (list) {
- next = list->next;
- prefetch(next);
- list->func(list);
- list = next;
- if (++count >= rdp->blimit)
- break;
- }
- rdp->donelist = list;
-
- local_irq_save(flags);
- rdp->qlen -= count;
- local_irq_restore(flags);
- if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
- rdp->blimit = blimit;
-
- if (!rdp->donelist)
- rdp->donetail = &rdp->donelist;
- else
- raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- * This is done by rcu_start_batch. The start is not broadcasted to
- * all cpus, they must pick this up by comparing rcp->cur with
- * rdp->quiescbatch. All cpus are recorded in the
- * rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- * Since the start of the grace period is not broadcasted, at least two
- * calls to rcu_check_quiescent_state are required:
- * The first call just notices that a new grace period is running. The
- * following calls check if there was a quiescent state since the beginning
- * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- * the bitmap is empty, then the grace period is completed.
- * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- * period (if necessary).
- */
-
-/*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
- */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
-{
- if (rcp->cur != rcp->pending &&
- rcp->completed == rcp->cur) {
- rcp->cur++;
- record_gp_stall_check_time(rcp);
-
- /*
- * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
- * Barrier Otherwise it can cause tickless idle CPUs to be
- * included in rcp->cpumask, which will extend graceperiods
- * unnecessarily.
- */
- smp_mb();
- cpumask_andnot(to_cpumask(rcp->cpumask),
- cpu_online_mask, nohz_cpu_mask);
-
- rcp->signaled = 0;
- }
-}
-
-/*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
- */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
-{
- cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
- if (cpumask_empty(to_cpumask(rcp->cpumask))) {
- /* batch completed ! */
- rcp->completed = rcp->cur;
- rcu_start_batch(rcp);
- }
-}
-
-/*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
- */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
-
- if (rdp->quiescbatch != rcp->cur) {
- /* start new grace period: */
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
- rdp->quiescbatch = rcp->cur;
- return;
- }
-
- /* Grace period already completed for this cpu?
- * qs_pending is checked instead of the actual bitmap to avoid
- * cacheline trashing.
- */
- if (!rdp->qs_pending)
- return;
-
- /*
- * Was there a quiescent state since the beginning of the grace
- * period? If no, then exit and wait for the next call.
- */
- if (!rdp->passed_quiesc)
- return;
- rdp->qs_pending = 0;
-
- spin_lock_irqsave(&rcp->lock, flags);
- /*
- * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
- * during cpu startup. Ignore the quiescent state.
- */
- if (likely(rdp->quiescbatch == rcp->cur))
- cpu_quiet(rdp->cpu, rcp);
-
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
- struct rcu_head **tail, long batch)
-{
- unsigned long flags;
-
- if (list) {
- local_irq_save(flags);
- this_rdp->batch = batch;
- *this_rdp->nxttail[2] = list;
- this_rdp->nxttail[2] = tail;
- local_irq_restore(flags);
- }
-}
-
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
- struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
- unsigned long flags;
-
- /*
- * if the cpu going offline owns the grace period
- * we can block indefinitely waiting for it, so flush
- * it here
- */
- spin_lock_irqsave(&rcp->lock, flags);
- if (rcp->cur != rcp->completed)
- cpu_quiet(rdp->cpu, rcp);
- rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
- rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
- spin_unlock(&rcp->lock);
-
- this_rdp->qlen += rdp->qlen;
- local_irq_restore(flags);
-}
-
-static void rcu_offline_cpu(int cpu)
-{
- struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
- struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
- __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
- &per_cpu(rcu_data, cpu));
- __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
- &per_cpu(rcu_bh_data, cpu));
- put_cpu_var(rcu_data);
- put_cpu_var(rcu_bh_data);
-}
-
-#else
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif
-
-/*
- * This does the RCU processing work from softirq context.
- */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
- long completed_snap;
-
- if (rdp->nxtlist) {
- local_irq_save(flags);
- completed_snap = ACCESS_ONCE(rcp->completed);
-
- /*
- * move the other grace-period-completed entries to
- * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
- */
- if (!rcu_batch_before(completed_snap, rdp->batch))
- rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
- else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
- rdp->nxttail[0] = rdp->nxttail[1];
-
- /*
- * the grace period for entries in
- * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
- * move these entries to donelist
- */
- if (rdp->nxttail[0] != &rdp->nxtlist) {
- *rdp->donetail = rdp->nxtlist;
- rdp->donetail = rdp->nxttail[0];
- rdp->nxtlist = *rdp->nxttail[0];
- *rdp->donetail = NULL;
-
- if (rdp->nxttail[1] == rdp->nxttail[0])
- rdp->nxttail[1] = &rdp->nxtlist;
- if (rdp->nxttail[2] == rdp->nxttail[0])
- rdp->nxttail[2] = &rdp->nxtlist;
- rdp->nxttail[0] = &rdp->nxtlist;
- }
-
- local_irq_restore(flags);
-
- if (rcu_batch_after(rdp->batch, rcp->pending)) {
- unsigned long flags2;
-
- /* and start it/schedule start if it's a new batch */
- spin_lock_irqsave(&rcp->lock, flags2);
- if (rcu_batch_after(rdp->batch, rcp->pending)) {
- rcp->pending = rdp->batch;
- rcu_start_batch(rcp);
- }
- spin_unlock_irqrestore(&rcp->lock, flags2);
- }
- }
-
- rcu_check_quiescent_state(rcp, rdp);
- if (rdp->donelist)
- rcu_do_batch(rdp);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be see before any RCU
- * grace-period manupulations below.
- */
-
- smp_mb(); /* See above block comment. */
-
- __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
- __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be see after any RCU
- * grace-period manupulations above.
- */
-
- smp_mb(); /* See above block comment. */
-}
-
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
- /* Check for CPU stalls, if enabled. */
- check_cpu_stall(rcp);
-
- if (rdp->nxtlist) {
- long completed_snap = ACCESS_ONCE(rcp->completed);
-
- /*
- * This cpu has pending rcu entries and the grace period
- * for them has completed.
- */
- if (!rcu_batch_before(completed_snap, rdp->batch))
- return 1;
- if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
- rdp->nxttail[0] != rdp->nxttail[1])
- return 1;
- if (rdp->nxttail[0] != &rdp->nxtlist)
- return 1;
-
- /*
- * This cpu has pending rcu entries and the new batch
- * for then hasn't been started nor scheduled start
- */
- if (rcu_batch_after(rdp->batch, rcp->pending))
- return 1;
- }
-
- /* This cpu has finished callbacks to invoke */
- if (rdp->donelist)
- return 1;
-
- /* The rcu core waits for a quiescent state from the cpu */
- if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
- return 1;
-
- /* nothing to do */
- return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so. This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-int rcu_pending(int cpu)
-{
- return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
- __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-
- return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
-}
-
-/*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt. This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
- */
-void rcu_check_callbacks(int cpu, int user)
-{
- if (user ||
- (idle_cpu(cpu) && rcu_scheduler_active &&
- !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-
- /*
- * Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so count it.
- *
- * Also do a memory barrier. This is needed to handle
- * the case where writes from a preempt-disable section
- * of code get reordered into schedule() by this CPU's
- * write buffer. The memory barrier makes sure that
- * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
- * by other CPUs to happen after any such write.
- */
-
- smp_mb(); /* See above block comment. */
- rcu_qsctr_inc(cpu);
- rcu_bh_qsctr_inc(cpu);
-
- } else if (!in_softirq()) {
-
- /*
- * Get here if this CPU did not take its interrupt from
- * softirq, in other words, if it is not interrupting
- * a rcu_bh read-side critical section. This is an _bh
- * critical section, so count it. The memory barrier
- * is needed for the same reason as is the above one.
- */
-
- smp_mb(); /* See above block comment. */
- rcu_bh_qsctr_inc(cpu);
- }
- raise_rcu_softirq();
-}
-
-static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rcp->lock, flags);
- memset(rdp, 0, sizeof(*rdp));
- rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
- rdp->donetail = &rdp->donelist;
- rdp->quiescbatch = rcp->completed;
- rdp->qs_pending = 0;
- rdp->cpu = cpu;
- rdp->blimit = blimit;
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void __cpuinit rcu_online_cpu(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
- rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
- rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_online_cpu(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- rcu_offline_cpu(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
-
-/*
- * Initializes rcu mechanism. Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
-void __init __rcu_init(void)
-{
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
- printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- /* Register notifier for non-boot CPUs */
- register_cpu_notifier(&rcu_nb);
-}
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..8df115600c2d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+ struct rcu_synchronize rcu;
+
+ if (rcu_blocking_is_gp())
+ return;
+
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ call_rcu_bh(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
static inline void wait_migrated_callbacks(void)
{
wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+ smp_mb(); /* In case we didn't sleep. */
}
/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
wake_up(&rcu_migrate_wq);
}
+extern int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu);
+
static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
unsigned long action, void *hcpu)
{
+ rcu_cpu_notify(self, action, hcpu);
if (action == CPU_DYING) {
/*
* preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -219,8 +248,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
void __init rcu_init(void)
{
+ int i;
+
__rcu_init();
- hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
+ cpu_notifier(rcu_barrier_cpu_hotplug, 0);
+
+ /*
+ * We don't need protection against CPU-hotplug here because
+ * this is called early in boot, before either interrupts
+ * or the scheduler are operational.
+ */
+ for_each_online_cpu(i)
+ rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
}
void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index beb0e659adcc..510898a7bd69 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -548,7 +548,7 @@ void rcu_irq_enter(void)
* rcu_irq_exit - Called from exiting Hard irq context.
*
* If the CPU was idle with dynamic ticks active, update the
- * rcu_dyntick_sched.dynticks to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to let the RCU handling be
* aware that the CPU is going back to idle with no ticks.
*/
void rcu_irq_exit(void)
@@ -849,7 +849,7 @@ rcu_try_flip_waitzero(void)
/* Check to see if the sum of the "last" counters is zero. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
+ for_each_possible_cpu(cpu)
sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
if (sum != 0) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -1067,12 +1067,6 @@ void rcu_offline_cpu(int cpu)
/* seen -after- acknowledgement. */
}
- RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
- RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
-
- RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
- RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
-
cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
@@ -1417,8 +1411,8 @@ int rcu_pending(int cpu)
return 0;
}
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1439,10 +1433,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
-
void __init __rcu_init(void)
{
int cpu;
@@ -1471,23 +1461,6 @@ void __init __rcu_init(void)
rdp->waitschedtail = &rdp->waitschedlist;
rdp->rcu_sched_sleeping = 0;
}
- register_cpu_notifier(&rcu_nb);
-
- /*
- * We don't need protection against CPU-Hotplug here
- * since
- * a) If a CPU comes online while we are iterating over the
- * cpu_online_mask below, we would only end up making a
- * duplicate call to rcu_online_cpu() which sets the corresponding
- * CPU's mask in the rcu_cpu_online_map.
- *
- * b) A CPU cannot go offline at this point in time since the user
- * does not have access to the sysfs interface, nor do we
- * suspend the system.
- */
- for_each_online_cpu(cpu)
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
-
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 7c2665cac172..11640346a507 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -236,12 +236,13 @@ static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
"CPU last cur F M\n");
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
long *flipctr = rcupreempt_flipctr(cpu);
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "%3d %4ld %3ld %d %d\n",
+ "%3d%c %4ld %3ld %d %d\n",
cpu,
+ cpu_is_offline(cpu) ? '!' : ' ',
flipctr[!f],
flipctr[f],
rcupreempt_flip_flag(cpu),
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
void (*init)(void);
void (*cleanup)(void);
int (*readlock)(void);
- void (*readdelay)(struct rcu_random_state *rrsp);
+ void (*read_delay)(struct rcu_random_state *rrsp);
void (*readunlock)(int idx);
int (*completed)(void);
- void (*deferredfree)(struct rcu_torture *p);
+ void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*cb_barrier)(void);
int (*stats)(char *page);
- int irqcapable;
+ int irq_capable;
char *name;
};
static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
rp->rtort_mbtest = 0;
rcu_torture_free(rp);
} else
- cur_ops->deferredfree(rp);
+ cur_ops->deferred_free(rp);
}
static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops rcu_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .readdelay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferredfree = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = rcu_barrier,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu"
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = rcu_barrier,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu"
};
static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .readdelay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = NULL,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_sync"
};
/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
}
static struct rcu_torture_ops rcu_bh_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferredfree = rcu_bh_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
- .cb_barrier = rcu_barrier_bh,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_bh"
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_bh_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = rcu_barrier_bh,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh"
};
static struct rcu_torture_ops rcu_bh_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
- .cb_barrier = NULL,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_bh_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh_sync"
};
/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
}
static struct rcu_torture_ops srcu_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
- .readlock = srcu_torture_read_lock,
- .readdelay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu"
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
};
/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
}
static struct rcu_torture_ops sched_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = sched_torture_completed,
- .deferredfree = rcu_sched_torture_deferred_free,
- .sync = sched_torture_synchronize,
- .cb_barrier = rcu_barrier_sched,
- .stats = NULL,
- .irqcapable = 1,
- .name = "sched"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sched_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = rcu_barrier_sched,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "sched"
};
static struct rcu_torture_ops sched_ops_sync = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = sched_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = sched_torture_synchronize,
- .cb_barrier = NULL,
- .stats = NULL,
- .name = "sched_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .name = "sched_sync"
+};
+
+extern int rcu_expedited_torture_stats(char *page);
+
+static struct rcu_torture_ops sched_expedited_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = synchronize_sched_expedited,
+ .cb_barrier = NULL,
+ .stats = rcu_expedited_torture_stats,
+ .irq_capable = 1,
+ .name = "sched_expedited"
};
/*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- cur_ops->deferredfree(old_rp);
+ cur_ops->deferred_free(old_rp);
}
rcu_torture_current_version++;
oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
spin_lock(&rand_lock);
- cur_ops->readdelay(&rand);
+ cur_ops->read_delay(&rand);
n_rcu_torture_timers++;
spin_unlock(&rand_lock);
preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
- if (irqreader && cur_ops->irqcapable)
+ if (irqreader && cur_ops->irq_capable)
setup_timer_on_stack(&t, rcu_torture_timer, 0);
do {
- if (irqreader && cur_ops->irqcapable) {
+ if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, 1);
}
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- cur_ops->readdelay(&rand);
+ cur_ops->read_delay(&rand);
preempt_disable();
pipe_count = p->rtort_pipe_count;
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
rcutorture_shutdown_absorb("rcu_torture_reader");
- if (irqreader && cur_ops->irqcapable)
+ if (irqreader && cur_ops->irq_capable)
del_timer_sync(&t);
while (!kthread_should_stop())
schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &sched_expedited_ops,
&srcu_ops, &sched_ops, &sched_ops_sync, };
mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..7e4e116ae216 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
@@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
}
printk(" (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start));
+ trigger_all_cpu_backtrace();
+
force_quiescent_state(rsp, 0); /* Kick them all. */
}
@@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
smp_processor_id(), jiffies - rsp->gp_start);
- dump_stack();
+ trigger_all_cpu_backtrace();
+
spin_lock_irqsave(&rnp->lock, flags);
if ((long)(jiffies - rsp->jiffies_stall) >= 0)
rsp->jiffies_stall =
jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
spin_unlock_irqrestore(&rnp->lock, flags);
+
set_need_resched(); /* kick ourselves to get things going. */
}
@@ -1132,6 +1137,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
+ WARN_ON_ONCE(rdp->beenonline == 0);
+
/*
* If an RCU GP has gone long enough, go check for dyntick
* idle CPUs and, if needed, send resched IPIs.
@@ -1325,22 +1332,40 @@ int rcu_needs_cpu(int cpu)
}
/*
- * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
- * approach so that we don't have to worry about how long the CPU has
- * been gone, or whether it ever was online previously. We do trust the
- * ->mynode field, as it is constant for a given struct rcu_data and
- * initialized during early boot.
- *
- * Note that only one online or offline event can be happening at a given
- * time. Note also that we can accept some slop in the rsp->completed
- * access due to the fact that this CPU cannot possibly have any RCU
- * callbacks in flight yet.
+ * Do boot-time initialization of a CPU's per-CPU RCU data.
+ */
+static void __init
+rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ int i;
+ struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ spin_lock_irqsave(&rnp->lock, flags);
+ rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+ rdp->qlen = 0;
+#ifdef CONFIG_NO_HZ
+ rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+ rdp->cpu = cpu;
+ spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data. Note that only one online or
+ * offline event can be happening at a given time. Note also that we
+ * can accept some slop in the rsp->completed access due to the fact
+ * that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void __cpuinit
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- int i;
long lastcomp;
unsigned long mask;
struct rcu_data *rdp = rsp->rda[cpu];
@@ -1355,16 +1380,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->qs_pending = 1; /* so set up to respond to current GP. */
rdp->beenonline = 1; /* We have now been online. */
rdp->passed_quiesc_completed = lastcomp - 1;
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
- rdp->qlen = 0;
rdp->blimit = blimit;
-#ifdef CONFIG_NO_HZ
- rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
- rdp->cpu = cpu;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
/*
@@ -1407,14 +1423,13 @@ static void __cpuinit rcu_online_cpu(int cpu)
{
rcu_init_percpu_data(cpu, &rcu_state);
rcu_init_percpu_data(cpu, &rcu_bh_state);
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
/*
* Handle CPU online/offline notifcation events.
*/
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1523,10 +1538,6 @@ do { \
} \
} while (0)
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
-
void __init __rcu_init(void)
{
int i; /* All used by RCU_DATA_PTR_INIT(). */
@@ -1539,13 +1550,13 @@ void __init __rcu_init(void)
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
rcu_init_one(&rcu_state);
RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+ for_each_possible_cpu(i)
+ rcu_boot_init_percpu_data(i, &rcu_state);
rcu_init_one(&rcu_bh_state);
RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
-
- for_each_online_cpu(i)
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
- /* Register notifier for non-boot CPUs */
- register_cpu_notifier(&rcu_nb);
+ for_each_possible_cpu(i)
+ rcu_boot_init_percpu_data(i, &rcu_bh_state);
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
module_param(blimit, int, 0);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..5184580613b7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -616,6 +616,7 @@ struct rq {
unsigned char idle_at_tick;
/* For active balancing */
+ int post_schedule;
int active_balance;
int push_cpu;
/* cpu of this runqueue: */
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq)
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() (&__raw_get_cpu_var(runqueues))
inline void update_rq_clock(struct rq *rq)
{
@@ -1522,13 +1524,18 @@ static void
update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares, unsigned long sd_rq_weight)
{
- unsigned long shares;
unsigned long rq_weight;
+ unsigned long shares;
+ int boost = 0;
if (!tg->se[cpu])
return;
rq_weight = tg->cfs_rq[cpu]->rq_weight;
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
/*
* \Sum shares * rq_weight
@@ -1545,8 +1552,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->shares = shares;
-
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
__set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1559,7 +1565,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long weight, rq_weight = 0;
+ unsigned long weight, rq_weight = 0, eff_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
@@ -1571,11 +1577,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
* run here it will not get delayed by group starvation.
*/
weight = tg->cfs_rq[i]->load.weight;
+ tg->cfs_rq[i]->rq_weight = weight;
+ rq_weight += weight;
+
if (!weight)
weight = NICE_0_LOAD;
- tg->cfs_rq[i]->rq_weight = weight;
- rq_weight += weight;
+ eff_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
@@ -1585,8 +1593,14 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight);
+ for_each_cpu(i, sched_domain_span(sd)) {
+ unsigned long sd_rq_weight = rq_weight;
+
+ if (!tg->cfs_rq[i]->rq_weight)
+ sd_rq_weight = eff_weight;
+
+ update_group_shares_cpu(tg, i, shares, sd_rq_weight);
+ }
return 0;
}
@@ -1616,8 +1630,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_shares(struct sched_domain *sd)
{
- u64 now = cpu_clock(raw_smp_processor_id());
- s64 elapsed = now - sd->last_update;
+ s64 elapsed;
+ u64 now;
+
+ if (root_task_group_empty())
+ return;
+
+ now = cpu_clock(raw_smp_processor_id());
+ elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
@@ -1627,6 +1647,9 @@ static void update_shares(struct sched_domain *sd)
static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
+ if (root_task_group_empty())
+ return;
+
spin_unlock(&rq->lock);
update_shares(sd);
spin_lock(&rq->lock);
@@ -1634,6 +1657,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
static void update_h_load(long cpu)
{
+ if (root_task_group_empty())
+ return;
+
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
@@ -2637,9 +2663,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
set_task_cpu(p, cpu);
/*
- * Make sure we do not leak PI boosting priority to the child:
+ * Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+ p->policy = SCHED_NORMAL;
+
+ if (p->normal_prio < DEFAULT_PRIO)
+ p->prio = DEFAULT_PRIO;
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ set_load_weight(p);
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
@@ -2796,12 +2845,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
-#ifdef CONFIG_SMP
- int post_schedule = 0;
-
- if (current->sched_class->needs_post_schedule)
- post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
rq->prev_mm = NULL;
@@ -2820,10 +2863,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
- if (post_schedule)
- current->sched_class->post_schedule(rq);
-#endif
fire_sched_in_preempt_notifiers(current);
if (mm)
@@ -2838,6 +2877,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
}
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+ if (rq->post_schedule) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->curr->sched_class->post_schedule)
+ rq->curr->sched_class->post_schedule(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ rq->post_schedule = 0;
+ }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -2848,6 +2923,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
struct rq *rq = this_rq();
finish_task_switch(rq, prev);
+
+ /*
+ * FIXME: do we need to worry about rq being invalidated by the
+ * task_switch?
+ */
+ post_schedule(rq);
+
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
@@ -5031,17 +5113,16 @@ void account_idle_time(cputime_t cputime)
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy = jiffies_to_cputime(1);
- cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
if (user_tick)
- account_user_time(p, one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
one_jiffy_scaled);
else
- account_idle_time(one_jiffy);
+ account_idle_time(cputime_one_jiffy);
}
/*
@@ -5349,10 +5430,7 @@ need_resched_nonpreemptible:
switch_count = &prev->nvcsw;
}
-#ifdef CONFIG_SMP
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-#endif
+ pre_schedule(rq, prev);
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
@@ -5378,6 +5456,8 @@ need_resched_nonpreemptible:
} else
spin_unlock_irq(&rq->lock);
+ post_schedule(rq);
+
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
@@ -6123,17 +6203,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
unsigned long flags;
const struct sched_class *prev_class = p->sched_class;
struct rq *rq;
+ int reset_on_fork;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
- if (policy < 0)
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
- else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
- return -EINVAL;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
+ return -EINVAL;
+ }
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6265,10 @@ recheck:
/* can't change other user's priorities */
if (!check_same_owner(p))
return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
}
if (user) {
@@ -6220,6 +6312,8 @@ recheck:
if (running)
p->sched_class->put_prev_task(rq, p);
+ p->sched_reset_on_fork = reset_on_fork;
+
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
@@ -6336,14 +6430,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (p) {
retval = security_task_getscheduler(p);
if (!retval)
- retval = p->policy;
+ retval = p->policy
+ | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
}
read_unlock(&tasklist_lock);
return retval;
}
/**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
*/
@@ -6571,19 +6666,9 @@ static inline int should_resched(void)
static void __cond_resched(void)
{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- __might_sleep(__FILE__, __LINE__);
-#endif
- /*
- * The BKS might be reacquired before we have dropped
- * PREEMPT_ACTIVE, which could trigger a second
- * cond_resched() call.
- */
- do {
- add_preempt_count(PREEMPT_ACTIVE);
- schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
- } while (need_resched());
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
@@ -6597,18 +6682,20 @@ int __sched _cond_resched(void)
EXPORT_SYMBOL(_cond_resched);
/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
{
int resched = should_resched();
int ret = 0;
+ lockdep_assert_held(lock);
+
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
@@ -6620,9 +6707,9 @@ int cond_resched_lock(spinlock_t *lock)
}
return ret;
}
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
@@ -6634,7 +6721,7 @@ int __sched cond_resched_softirq(void)
}
return 0;
}
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
/**
* yield - yield the current processor to other threads.
@@ -6658,7 +6745,7 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
@@ -6670,7 +6757,7 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
long ret;
delayacct_blkio_start();
@@ -6992,8 +7079,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */
+ struct task_struct *mt = rq->migration_thread;
+
+ get_task_struct(mt);
task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread);
+ put_task_struct(mt);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
return 0;
@@ -7051,6 +7142,11 @@ fail:
return ret;
}
+#define RCU_MIGRATION_IDLE 0
+#define RCU_MIGRATION_NEED_QS 1
+#define RCU_MIGRATION_GOT_QS 2
+#define RCU_MIGRATION_MUST_SYNC 3
+
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7154,7 @@ fail:
*/
static int migration_thread(void *data)
{
+ int badcpu;
int cpu = (long)data;
struct rq *rq;
@@ -7092,8 +7189,17 @@ static int migration_thread(void *data)
req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
+ if (req->task != NULL) {
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ } else if (likely(cpu == (badcpu = smp_processor_id()))) {
+ req->dest_cpu = RCU_MIGRATION_GOT_QS;
+ spin_unlock(&rq->lock);
+ } else {
+ req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+ spin_unlock(&rq->lock);
+ WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+ }
local_irq_enable();
complete(&req->done);
@@ -7625,7 +7731,7 @@ static int __init migration_init(void)
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
- return err;
+ return 0;
}
early_initcall(migration_init);
#endif
@@ -7841,7 +7947,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
rq->rd = rd;
cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
@@ -9334,6 +9440,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -9398,13 +9505,20 @@ void __init sched_init(void)
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+ return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
{
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((!in_atomic() && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -10581,3 +10695,113 @@ struct cgroup_subsys cpuacct_subsys = {
.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+ int cnt = 0;
+ int cpu;
+
+ cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+ for_each_online_cpu(cpu) {
+ cnt += sprintf(&page[cnt], " %d:%d",
+ cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+ int cpu;
+ unsigned long flags;
+ bool need_full_sync = 0;
+ struct rq *rq;
+ struct migration_req *req;
+ long snap;
+ int trycount = 0;
+
+ smp_mb(); /* ensure prior mod happens before capturing snap. */
+ snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+ get_online_cpus();
+ while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+ put_online_cpus();
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+ if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+ get_online_cpus();
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ req = &per_cpu(rcu_migration_req, cpu);
+ init_completion(&req->done);
+ req->task = NULL;
+ req->dest_cpu = RCU_MIGRATION_NEED_QS;
+ spin_lock_irqsave(&rq->lock, flags);
+ list_add(&req->list, &rq->migration_queue);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ wake_up_process(rq->migration_thread);
+ }
+ for_each_online_cpu(cpu) {
+ rcu_expedited_state = cpu;
+ req = &per_cpu(rcu_migration_req, cpu);
+ rq = cpu_rq(cpu);
+ wait_for_completion(&req->done);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+ need_full_sync = 1;
+ req->dest_cpu = RCU_MIGRATION_IDLE;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+ mutex_unlock(&rcu_sched_expedited_mutex);
+ put_online_cpus();
+ if (need_full_sync)
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/*
* If the cpu was currently mapped to a different value, we
- * first need to unmap the old value
+ * need to map it to the new value then remove the old value.
+ * Note, we must add the new value first, otherwise we risk the
+ * cpu being cleared from pri_active, and this cpu could be
+ * missed for a push or pull.
*/
- if (likely(oldpri != CPUPRI_INVALID)) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
-
- spin_lock_irqsave(&vec->lock, flags);
-
- vec->count--;
- if (!vec->count)
- clear_bit(oldpri, cp->pri_active);
- cpumask_clear_cpu(cpu, vec->mask);
-
- spin_unlock_irqrestore(&vec->lock, flags);
- }
-
if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
spin_unlock_irqrestore(&vec->lock, flags);
}
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ vec->count--;
+ if (!vec->count)
+ clear_bit(oldpri, cp->pri_active);
+ cpumask_clear_cpu(cpu, vec->mask);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
*currpri = newpri;
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..342000b31ad6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
* CFS operations on generic schedulable entities:
*/
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!entity_is_task(se));
+#endif
+ return container_of(se, struct task_struct, se);
+}
+
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
@@ -1046,17 +1054,21 @@ static void yield_task_fair(struct rq *rq)
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_mask)
+ * hence we need to mask them out (rq->rd->online)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+
+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
+
static int wake_idle(int cpu, struct task_struct *p)
{
struct sched_domain *sd;
int i;
unsigned int chosen_wakeup_cpu;
int this_cpu;
+ struct rq *task_rq = task_rq(p);
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1101,10 @@ static int wake_idle(int cpu, struct task_struct *p)
for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
- && !task_hot(p, task_rq(p)->clock, sd))) {
+ && !task_hot(p, task_rq->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) {
- if (cpu_active(i) && idle_cpu(i)) {
+ if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..3d4020a9ba1b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
* policies)
*/
+#ifdef CONFIG_RT_GROUP_SCHED
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
return container_of(rt_se, struct task_struct, rt);
}
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
-
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
#define rt_entity_is_task(rt_se) (1)
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
#else
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -1064,6 +1077,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (p)
dequeue_pushable_task(rq, p);
+#ifdef CONFIG_SMP
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+#endif
+
return p;
}
@@ -1162,13 +1183,6 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No targets found */
/*
- * Only consider CPUs that are usable for migration.
- * I guess we might want to change cpupri_find() to ignore those
- * in the first place.
- */
- cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
-
- /*
* At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology.
@@ -1262,11 +1276,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static inline int has_pushable_tasks(struct rq *rq)
-{
- return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
struct task_struct *p;
@@ -1466,23 +1475,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
pull_rt_task(rq);
}
-/*
- * assumes rq->lock is held
- */
-static int needs_post_schedule_rt(struct rq *rq)
-{
- return has_pushable_tasks(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
- /*
- * This is only called if needs_post_schedule_rt() indicates that
- * we need to push tasks away
- */
- spin_lock_irq(&rq->lock);
push_rt_tasks(rq);
- spin_unlock_irq(&rq->lock);
}
/*
@@ -1758,7 +1753,6 @@ static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
- .needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..02c0b2c9c674 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -125,6 +125,75 @@ void getnstimeofday(struct timespec *ts)
EXPORT_SYMBOL(getnstimeofday);
+ktime_t ktime_get(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ unsigned int seq;
+ s64 secs, nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
+ nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs += cyc2ns(clock, cycle_delta);
+
+ } while (read_seqretry(&xtime_lock, seq));
+ /*
+ * Use ktime_set/ktime_add_ns to create a proper ktime on
+ * 32-bit architectures without CONFIG_KTIME_SCALAR.
+ */
+ return ktime_add_ns(ktime_set(secs, 0), nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+ cycle_t cycle_now, cycle_delta;
+ struct timespec tomono;
+ unsigned int seq;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *ts = xtime;
+ tomono = wall_to_monotonic;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs = cyc2ns(clock, cycle_delta);
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
@@ -221,10 +290,65 @@ static void change_clocksource(void)
clock->name);
*/
}
-#else
+#else /* GENERIC_TIME */
static inline void clocksource_forward_now(void) { }
static inline void change_clocksource(void) { }
-#endif
+
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+ struct timespec now;
+
+ ktime_get_ts(&now);
+
+ return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+ struct timespec tomono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ getnstimeofday(ts);
+ tomono = wall_to_monotonic;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+#endif /* !GENERIC_TIME */
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+ struct timespec now;
+
+ getnstimeofday(&now);
+
+ return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real);
/**
* getrawmonotonic - Returns the raw monotonic time in a timespec
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..33fc9d175f40 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -72,6 +72,7 @@ struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
unsigned long timer_jiffies;
+ unsigned long next_timer;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
if (timer_pending(timer)) {
detach_timer(timer, 0);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
} else {
if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
+ if (time_before(timer->expires, base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
debug_timer_activate(timer);
+ if (time_before(timer->expires, base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = timer->expires;
internal_add_timer(base, timer);
/*
* Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
if (timer_pending(timer)) {
detach_timer(timer, 1);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
ret = 0;
if (timer_pending(timer)) {
detach_timer(timer, 1);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
}
out:
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
unsigned long expires;
spin_lock(&base->lock);
- expires = __next_timer_interrupt(base);
+ if (time_before_eq(base->next_timer, base->timer_jiffies))
+ base->next_timer = __next_timer_interrupt(base);
+ expires = base->next_timer;
spin_unlock(&base->lock);
if (time_before_eq(expires, now))
@@ -1523,6 +1541,7 @@ static int __cpuinit init_timers_cpu(int cpu)
INIT_LIST_HEAD(base->tv1.vec + j);
base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
return 0;
}
@@ -1535,6 +1554,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
timer_set_base(timer, new_base);
+ if (time_before(timer->expires, new_base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ new_base->next_timer = timer->expires;
internal_add_timer(new_base, timer);
}
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e1d23c26308..1993b7186cdb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
- unsigned long ip, fl;
+ unsigned long flag = 0UL;
ftrace_addr = (unsigned long)FTRACE_ADDR;
- ip = rec->ip;
-
/*
- * If this record is not to be traced and
- * it is not enabled then do nothing.
+ * If this record is not to be traced or we want to disable it,
+ * then disable it.
*
- * If this record is not to be traced and
- * it is enabled then disable it.
+ * If we want to enable it and filtering is off, then enable it.
*
+ * If we want to enable it and filtering is on, enable it only if
+ * it's filtered
*/
- if (rec->flags & FTRACE_FL_NOTRACE) {
- if (rec->flags & FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- return 0;
-
- } else if (ftrace_filtered && enable) {
- /*
- * Filtering is on:
- */
-
- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
-
- /* Record is filtered and enabled, do nothing */
- if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
- return 0;
-
- /* Record is not filtered or enabled, do nothing */
- if (!fl)
- return 0;
-
- /* Record is not filtered but enabled, disable it */
- if (fl == FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- /* Otherwise record is filtered but not enabled, enable it */
- rec->flags |= FTRACE_FL_ENABLED;
- } else {
- /* Disable or not filtered */
-
- if (enable) {
- /* if record is enabled, do nothing */
- if (rec->flags & FTRACE_FL_ENABLED)
- return 0;
-
- rec->flags |= FTRACE_FL_ENABLED;
-
- } else {
+ if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
+ if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
+ flag = FTRACE_FL_ENABLED;
+ }
- /* if record is not enabled, do nothing */
- if (!(rec->flags & FTRACE_FL_ENABLED))
- return 0;
+ /* If the state of this record hasn't changed, then do nothing */
+ if ((rec->flags & FTRACE_FL_ENABLED) == flag)
+ return 0;
- rec->flags &= ~FTRACE_FL_ENABLED;
- }
+ if (flag) {
+ rec->flags |= FTRACE_FL_ENABLED;
+ return ftrace_make_call(rec, ftrace_addr);
}
- if (rec->flags & FTRACE_FL_ENABLED)
- return ftrace_make_call(rec, ftrace_addr);
- else
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
static void ftrace_replace_code(int enable)
@@ -1375,7 +1339,6 @@ struct ftrace_iterator {
unsigned flags;
unsigned char buffer[FTRACE_BUFF_MAX+1];
unsigned buffer_idx;
- unsigned filtered;
};
static void *
@@ -1438,18 +1401,13 @@ static int t_hash_show(struct seq_file *m, void *v)
{
struct ftrace_func_probe *rec;
struct hlist_node *hnd = v;
- char str[KSYM_SYMBOL_LEN];
rec = hlist_entry(hnd, struct ftrace_func_probe, node);
if (rec->ops->print)
return rec->ops->print(m, rec->ip, rec->ops, rec->data);
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
- seq_printf(m, "%s:", str);
-
- kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
- seq_printf(m, "%s", str);
+ seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
if (rec->data)
seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
struct dyn_ftrace *rec = v;
- char str[KSYM_SYMBOL_LEN];
if (iter->flags & FTRACE_ITER_HASH)
return t_hash_show(m, v);
@@ -1560,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pf\n", (void *)rec->ip);
return 0;
}
@@ -1601,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return ret;
}
-int ftrace_avail_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- struct ftrace_iterator *iter = m->private;
-
- seq_release(inode, file);
- kfree(iter);
-
- return 0;
-}
-
static int
ftrace_failures_open(struct inode *inode, struct file *file)
{
@@ -2312,7 +2256,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
}
if (isspace(ch)) {
- iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
ret = ftrace_process_regex(iter->buffer,
iter->buffer_idx, enable);
@@ -2443,7 +2386,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
iter = file->private_data;
if (iter->buffer_idx) {
- iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
}
@@ -2474,14 +2416,14 @@ static const struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
static const struct file_operations ftrace_failures_fops = {
.open = ftrace_failures_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
static const struct file_operations ftrace_filter_fops = {
@@ -2543,7 +2485,6 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
unsigned long *ptr = v;
- char str[KSYM_SYMBOL_LEN];
if (!ptr)
return 0;
@@ -2553,9 +2494,7 @@ static int g_show(struct seq_file *m, void *v)
return 0;
}
- kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
-
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pf\n", v);
return 0;
}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..dda53ccf749b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc {
};
static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter,
- struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
{
- struct kmemtrace_user_event_alloc *ev_alloc;
struct trace_seq *s = &iter->seq;
+ struct kmemtrace_alloc_entry *entry;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
+ "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+ entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
+ (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
+ (unsigned long)entry->gfp_flags, entry->node);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_free_entry *entry;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
+ entry->type_id, (void *)entry->call_site,
+ (unsigned long)entry->ptr);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_alloc_entry *entry;
struct kmemtrace_user_event *ev;
+ struct kmemtrace_user_event_alloc *ev_alloc;
+
+ trace_assign_type(entry, iter->ent);
ev = trace_seq_reserve(s, sizeof(*ev));
if (!ev)
@@ -271,12 +311,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
}
static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
{
struct trace_seq *s = &iter->seq;
+ struct kmemtrace_free_entry *entry;
struct kmemtrace_user_event *ev;
+ trace_assign_type(entry, iter->ent);
+
ev = trace_seq_reserve(s, sizeof(*ev));
if (!ev)
return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +336,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
/* The two other following provide a more minimalistic output */
static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter,
- struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc_compress(struct trace_iterator *iter)
{
+ struct kmemtrace_alloc_entry *entry;
struct trace_seq *s = &iter->seq;
int ret;
+ trace_assign_type(entry, iter->ent);
+
/* Alloc entry */
ret = trace_seq_printf(s, " + ");
if (!ret)
@@ -345,29 +389,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- /* Node */
- ret = trace_seq_printf(s, "%4d ", entry->node);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Call site */
- ret = seq_print_ip_sym(s, entry->call_site, 0);
+ /* Node and call site*/
+ ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
+ (void *)entry->call_site);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (!trace_seq_printf(s, "\n"))
- return TRACE_TYPE_PARTIAL_LINE;
-
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_compress(struct trace_iterator *iter)
{
+ struct kmemtrace_free_entry *entry;
struct trace_seq *s = &iter->seq;
int ret;
+ trace_assign_type(entry, iter->ent);
+
/* Free entry */
ret = trace_seq_printf(s, " - ");
if (!ret)
@@ -401,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- /* Skip node */
- ret = trace_seq_printf(s, " ");
+ /* Skip node and print call site*/
+ ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- /* Call site */
- ret = seq_print_ip_sym(s, entry->call_site, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- if (!trace_seq_printf(s, "\n"))
- return TRACE_TYPE_PARTIAL_LINE;
-
return TRACE_TYPE_HANDLED;
}
@@ -421,32 +452,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
- switch (entry->type) {
- case TRACE_KMEM_ALLOC: {
- struct kmemtrace_alloc_entry *field;
-
- trace_assign_type(field, entry);
- if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
- return kmemtrace_print_alloc_compress(iter, field);
- else
- return kmemtrace_print_alloc_user(iter, field);
- }
-
- case TRACE_KMEM_FREE: {
- struct kmemtrace_free_entry *field;
-
- trace_assign_type(field, entry);
- if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
- return kmemtrace_print_free_compress(iter, field);
- else
- return kmemtrace_print_free_user(iter, field);
- }
+ if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+ return TRACE_TYPE_UNHANDLED;
+ switch (entry->type) {
+ case TRACE_KMEM_ALLOC:
+ return kmemtrace_print_alloc_compress(iter);
+ case TRACE_KMEM_FREE:
+ return kmemtrace_print_free_compress(iter);
default:
return TRACE_TYPE_UNHANDLED;
}
}
+static struct trace_event kmem_trace_alloc = {
+ .type = TRACE_KMEM_ALLOC,
+ .trace = kmemtrace_print_alloc,
+ .binary = kmemtrace_print_alloc_user,
+};
+
+static struct trace_event kmem_trace_free = {
+ .type = TRACE_KMEM_FREE,
+ .trace = kmemtrace_print_free,
+ .binary = kmemtrace_print_free_user,
+};
+
static struct tracer kmem_tracer __read_mostly = {
.name = "kmemtrace",
.init = kmem_trace_init,
@@ -463,6 +493,21 @@ void kmemtrace_init(void)
static int __init init_kmem_tracer(void)
{
- return register_tracer(&kmem_tracer);
+ if (!register_ftrace_event(&kmem_trace_alloc)) {
+ pr_warning("Warning: could not register kmem events\n");
+ return 1;
+ }
+
+ if (!register_ftrace_event(&kmem_trace_free)) {
+ pr_warning("Warning: could not register kmem events\n");
+ return 1;
+ }
+
+ if (!register_tracer(&kmem_tracer)) {
+ pr_warning("Warning: could not register the kmem tracer\n");
+ return 1;
+ }
+
+ return 0;
}
device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..da2c59d8f486 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -322,6 +322,14 @@ struct buffer_data_page {
unsigned char data[]; /* data of buffer page */
};
+/*
+ * Note, the buffer_page list must be first. The buffer pages
+ * are allocated in cache lines, which means that each buffer
+ * page will be at the beginning of a cache line, and thus
+ * the least significant bits will be zero. We use this to
+ * add flags in the list struct pointers, to make the ring buffer
+ * lockless.
+ */
struct buffer_page {
struct list_head list; /* list of buffer pages */
local_t write; /* index for next write */
@@ -330,6 +338,21 @@ struct buffer_page {
struct buffer_data_page *page; /* Actual data page */
};
+/*
+ * The buffer page counters, write and entries, must be reset
+ * atomically when crossing page boundaries. To synchronize this
+ * update, two counters are inserted into the number. One is
+ * the actual counter for the write position or count on the page.
+ *
+ * The other is a counter of updaters. Before an update happens
+ * the update partition of the counter is incremented. This will
+ * allow the updater to update the counter atomically.
+ *
+ * The counter is 20 bits, and the state data is 12.
+ */
+#define RB_WRITE_MASK 0xfffff
+#define RB_WRITE_INTCNT (1 << 20)
+
static void rb_init_page(struct buffer_data_page *bpage)
{
local_set(&bpage->commit, 0);
@@ -403,21 +426,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
struct ring_buffer_per_cpu {
int cpu;
struct ring_buffer *buffer;
- spinlock_t reader_lock; /* serialize readers */
+ spinlock_t reader_lock; /* serialize readers */
raw_spinlock_t lock;
struct lock_class_key lock_key;
- struct list_head pages;
+ struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
- unsigned long nmi_dropped;
- unsigned long commit_overrun;
- unsigned long overrun;
- unsigned long read;
+ local_t commit_overrun;
+ local_t overrun;
local_t entries;
local_t committing;
local_t commits;
+ unsigned long read;
u64 write_stamp;
u64 read_stamp;
atomic_t record_disabled;
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+/*
+ * Making the ring buffer lockless makes things tricky.
+ * Although writes only happen on the CPU that they are on,
+ * and they only need to worry about interrupts. Reads can
+ * happen on any CPU.
+ *
+ * The reader page is always off the ring buffer, but when the
+ * reader finishes with a page, it needs to swap its page with
+ * a new one from the buffer. The reader needs to take from
+ * the head (writes go to the tail). But if a writer is in overwrite
+ * mode and wraps, it must push the head page forward.
+ *
+ * Here lies the problem.
+ *
+ * The reader must be careful to replace only the head page, and
+ * not another one. As described at the top of the file in the
+ * ASCII art, the reader sets its old page to point to the next
+ * page after head. It then sets the page after head to point to
+ * the old reader page. But if the writer moves the head page
+ * during this operation, the reader could end up with the tail.
+ *
+ * We use cmpxchg to help prevent this race. We also do something
+ * special with the page before head. We set the LSB to 1.
+ *
+ * When the writer must push the page forward, it will clear the
+ * bit that points to the head page, move the head, and then set
+ * the bit that points to the new head page.
+ *
+ * We also don't want an interrupt coming in and moving the head
+ * page on another writer. Thus we use the second LSB to catch
+ * that too. Thus:
+ *
+ * head->list->prev->next bit 1 bit 0
+ * ------- -------
+ * Normal page 0 0
+ * Points to head page 0 1
+ * New head page 1 0
+ *
+ * Note we can not trust the prev pointer of the head page, because:
+ *
+ * +----+ +-----+ +-----+
+ * | |------>| T |---X--->| N |
+ * | |<------| | | |
+ * +----+ +-----+ +-----+
+ * ^ ^ |
+ * | +-----+ | |
+ * +----------| R |----------+ |
+ * | |<-----------+
+ * +-----+
+ *
+ * Key: ---X--> HEAD flag set in pointer
+ * T Tail page
+ * R Reader page
+ * N Next page
+ *
+ * (see __rb_reserve_next() to see where this happens)
+ *
+ * What the above shows is that the reader just swapped out
+ * the reader page with a page in the buffer, but before it
+ * could make the new header point back to the new page added
+ * it was preempted by a writer. The writer moved forward onto
+ * the new page added by the reader and is about to move forward
+ * again.
+ *
+ * You can see, it is legitimate for the previous pointer of
+ * the head (or any page) not to point back to itself. But only
+ * temporarially.
+ */
+
+#define RB_PAGE_NORMAL 0UL
+#define RB_PAGE_HEAD 1UL
+#define RB_PAGE_UPDATE 2UL
+
+
+#define RB_FLAG_MASK 3UL
+
+/* PAGE_MOVED is not part of the mask */
+#define RB_PAGE_MOVED 4UL
+
+/*
+ * rb_list_head - remove any bit
+ */
+static struct list_head *rb_list_head(struct list_head *list)
+{
+ unsigned long val = (unsigned long)list;
+
+ return (struct list_head *)(val & ~RB_FLAG_MASK);
+}
+
+/*
+ * rb_is_head_page - test if the give page is the head page
+ *
+ * Because the reader may move the head_page pointer, we can
+ * not trust what the head page is (it may be pointing to
+ * the reader page). But if the next page is a header page,
+ * its flags will be non zero.
+ */
+static int inline
+rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *page, struct list_head *list)
+{
+ unsigned long val;
+
+ val = (unsigned long)list->next;
+
+ if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
+ return RB_PAGE_MOVED;
+
+ return val & RB_FLAG_MASK;
+}
+
+/*
+ * rb_is_reader_page
+ *
+ * The unique thing about the reader page, is that, if the
+ * writer is ever on it, the previous pointer never points
+ * back to the reader page.
+ */
+static int rb_is_reader_page(struct buffer_page *page)
+{
+ struct list_head *list = page->list.prev;
+
+ return rb_list_head(list->next) != &page->list;
+}
+
+/*
+ * rb_set_list_to_head - set a list_head to be pointing to head.
+ */
+static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ unsigned long *ptr;
+
+ ptr = (unsigned long *)&list->next;
+ *ptr |= RB_PAGE_HEAD;
+ *ptr &= ~RB_PAGE_UPDATE;
+}
+
+/*
+ * rb_head_page_activate - sets up head page
+ */
+static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+
+ head = cpu_buffer->head_page;
+ if (!head)
+ return;
+
+ /*
+ * Set the previous list pointer to have the HEAD flag.
+ */
+ rb_set_list_to_head(cpu_buffer, head->list.prev);
+}
+
+static void rb_list_head_clear(struct list_head *list)
+{
+ unsigned long *ptr = (unsigned long *)&list->next;
+
+ *ptr &= ~RB_FLAG_MASK;
+}
+
+/*
+ * rb_head_page_dactivate - clears head page ptr (for free list)
+ */
+static void
+rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *hd;
+
+ /* Go through the whole list and clear any pointers found. */
+ rb_list_head_clear(cpu_buffer->pages);
+
+ list_for_each(hd, cpu_buffer->pages)
+ rb_list_head_clear(hd);
+}
+
+static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag, int new_flag)
+{
+ struct list_head *list;
+ unsigned long val = (unsigned long)&head->list;
+ unsigned long ret;
+
+ list = &prev->list;
+
+ val &= ~RB_FLAG_MASK;
+
+ ret = (unsigned long)cmpxchg(&list->next,
+ val | old_flag, val | new_flag);
+
+ /* check if the reader took the page */
+ if ((ret & ~RB_FLAG_MASK) != val)
+ return RB_PAGE_MOVED;
+
+ return ret & RB_FLAG_MASK;
+}
+
+static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_UPDATE);
+}
+
+static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_HEAD);
+}
+
+static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_NORMAL);
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.next);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
+static struct buffer_page *
+rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+ struct buffer_page *page;
+ struct list_head *list;
+ int i;
+
+ if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
+ return NULL;
+
+ /* sanity check */
+ list = cpu_buffer->pages;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
+ return NULL;
+
+ page = head = cpu_buffer->head_page;
+ /*
+ * It is possible that the writer moves the header behind
+ * where we started, and we miss in one loop.
+ * A second loop should grab the header, but we'll do
+ * three loops just because I'm paranoid.
+ */
+ for (i = 0; i < 3; i++) {
+ do {
+ if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ cpu_buffer->head_page = page;
+ return page;
+ }
+ rb_inc_page(cpu_buffer, &page);
+ } while (page != head);
+ }
+
+ RB_WARN_ON(cpu_buffer, 1);
+
+ return NULL;
+}
+
+static int rb_head_page_replace(struct buffer_page *old,
+ struct buffer_page *new)
+{
+ unsigned long *ptr = (unsigned long *)&old->list.prev->next;
+ unsigned long val;
+ unsigned long ret;
+
+ val = *ptr & ~RB_FLAG_MASK;
+ val |= RB_PAGE_HEAD;
+
+ ret = cmpxchg(ptr, val, &new->list);
+
+ return ret == val;
+}
+
+/*
+ * rb_tail_page_update - move the tail page forward
+ *
+ * Returns 1 if moved tail page, 0 if someone else did.
+ */
+static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *old_tail;
+ unsigned long old_entries;
+ unsigned long old_write;
+ int ret = 0;
+
+ /*
+ * The tail page now needs to be moved forward.
+ *
+ * We need to reset the tail page, but without messing
+ * with possible erasing of data brought in by interrupts
+ * that have moved the tail page and are currently on it.
+ *
+ * We add a counter to the write field to denote this.
+ */
+ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+ old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+
+ /*
+ * Just make sure we have seen our old_write and synchronize
+ * with any interrupts that come in.
+ */
+ barrier();
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* Zero the write counter */
+ unsigned long val = old_write & ~RB_WRITE_MASK;
+ unsigned long eval = old_entries & ~RB_WRITE_MASK;
+
+ /*
+ * This will only succeed if an interrupt did
+ * not come in and change it. In which case, we
+ * do not want to modify it.
+ *
+ * We add (void) to let the compiler know that we do not care
+ * about the return value of these functions. We use the
+ * cmpxchg to only update if an interrupt did not already
+ * do it for us. If the cmpxchg fails, we don't care.
+ */
+ (void)local_cmpxchg(&next_page->write, old_write, val);
+ (void)local_cmpxchg(&next_page->entries, old_entries, eval);
+
+ /*
+ * No need to worry about races with clearing out the commit.
+ * it only can increment when a commit takes place. But that
+ * only happens in the outer most nested commit.
+ */
+ local_set(&next_page->page->commit, 0);
+
+ old_tail = cmpxchg(&cpu_buffer->tail_page,
+ tail_page, next_page);
+
+ if (old_tail == tail_page)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ unsigned long val = (unsigned long)bpage;
+
+ if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * rb_check_list - make sure a pointer to a list has the last bits zero
+ */
+static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
+ return 1;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
+ return 1;
+ return 0;
+}
+
/**
* check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ rb_head_page_deactivate(cpu_buffer);
+
if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
return -1;
if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
return -1;
+ if (rb_check_list(cpu_buffer, head))
+ return -1;
+
list_for_each_entry_safe(bpage, tmp, head, list) {
if (RB_WARN_ON(cpu_buffer,
bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
bpage->list.prev->next != &bpage->list))
return -1;
+ if (rb_check_list(cpu_buffer, &bpage->list))
+ return -1;
}
+ rb_head_page_activate(cpu_buffer);
+
return 0;
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
unsigned nr_pages)
{
- struct list_head *head = &cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
unsigned long addr;
LIST_HEAD(pages);
unsigned i;
+ WARN_ON(!nr_pages);
+
for (i = 0; i < nr_pages; i++) {
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, &pages);
addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}
- list_splice(&pages, head);
+ /*
+ * The ring buffer page list is a circular list that does not
+ * start and end with a list head. All page list items point to
+ * other pages.
+ */
+ cpu_buffer->pages = pages.next;
+ list_del(&pages);
rb_check_pages(cpu_buffer);
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- INIT_LIST_HEAD(&cpu_buffer->pages);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
goto fail_free_buffer;
+ rb_check_bpage(cpu_buffer, bpage);
+
cpu_buffer->reader_page = bpage;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
goto fail_free_reader;
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_head_page_activate(cpu_buffer);
+
return cpu_buffer;
fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
free_buffer_page(cpu_buffer->reader_page);
- list_for_each_entry_safe(bpage, tmp, head, list) {
- list_del_init(&bpage->list);
+ rb_head_page_deactivate(cpu_buffer);
+
+ if (head) {
+ list_for_each_entry_safe(bpage, tmp, head, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ bpage = list_entry(head, struct buffer_page, list);
free_buffer_page(bpage);
}
+
kfree(cpu_buffer);
}
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
+ rb_head_page_deactivate(cpu_buffer);
+
for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
return;
- p = cpu_buffer->pages.next;
+ p = cpu_buffer->pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
return;
rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
+ spin_lock_irq(&cpu_buffer->reader_lock);
+ rb_head_page_deactivate(cpu_buffer);
+
for (i = 0; i < nr_pages; i++) {
if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
return;
p = pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
- list_add_tail(&bpage->list, &cpu_buffer->pages);
+ list_add_tail(&bpage->list, cpu_buffer->pages);
}
rb_reset_cpu(cpu_buffer);
+ spin_unlock_irq(&cpu_buffer->reader_lock);
rb_check_pages(cpu_buffer);
@@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
}
static inline struct ring_buffer_event *
-rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return __rb_page_index(cpu_buffer->head_page,
- cpu_buffer->head_page->read);
-}
-
-static inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned rb_page_write(struct buffer_page *bpage)
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
{
- return local_read(&bpage->write);
+ return local_read(&bpage->write) & RB_WRITE_MASK;
}
static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
return local_read(&bpage->page->commit);
}
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
/* Size is determined by what has been commited */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
return rb_page_commit(cpu_buffer->commit_page);
}
-static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return rb_page_commit(cpu_buffer->head_page);
-}
-
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
-{
- struct list_head *p = (*bpage)->list.next;
-
- if (p == &cpu_buffer->pages)
- p = p->next;
-
- *bpage = list_entry(p, struct buffer_page, list);
-}
-
static inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
static void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
+ unsigned long max_count;
+
/*
* We only race with interrupts and NMIs on this CPU.
* If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
+ max_count = cpu_buffer->buffer->pages * 100;
+
while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
}
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
barrier();
}
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* to the head page instead of next.
*/
if (iter->head_page == cpu_buffer->reader_page)
- iter->head_page = cpu_buffer->head_page;
+ iter->head_page = rb_set_head_page(cpu_buffer);
else
rb_inc_page(cpu_buffer, &iter->head_page);
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
}
}
+/*
+ * rb_handle_head_page - writer hit the head page
+ *
+ * Returns: +1 to retry page
+ * 0 to continue
+ * -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *new_head;
+ int entries;
+ int type;
+ int ret;
+
+ entries = rb_page_entries(next_page);
+
+ /*
+ * The hard part is here. We need to move the head
+ * forward, and protect against both readers on
+ * other CPUs and writers coming in via interrupts.
+ */
+ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+ RB_PAGE_HEAD);
+
+ /*
+ * type can be one of four:
+ * NORMAL - an interrupt already moved it for us
+ * HEAD - we are the first to get here.
+ * UPDATE - we are the interrupt interrupting
+ * a current move.
+ * MOVED - a reader on another CPU moved the next
+ * pointer to its reader page. Give up
+ * and try again.
+ */
+
+ switch (type) {
+ case RB_PAGE_HEAD:
+ /*
+ * We changed the head to UPDATE, thus
+ * it is our responsibility to update
+ * the counters.
+ */
+ local_add(entries, &cpu_buffer->overrun);
+
+ /*
+ * The entries will be zeroed out when we move the
+ * tail page.
+ */
+
+ /* still more to do */
+ break;
+
+ case RB_PAGE_UPDATE:
+ /*
+ * This is an interrupt that interrupt the
+ * previous update. Still more to do.
+ */
+ break;
+ case RB_PAGE_NORMAL:
+ /*
+ * An interrupt came in before the update
+ * and processed this for us.
+ * Nothing left to do.
+ */
+ return 1;
+ case RB_PAGE_MOVED:
+ /*
+ * The reader is on another CPU and just did
+ * a swap with our next_page.
+ * Try again.
+ */
+ return 1;
+ default:
+ RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+ return -1;
+ }
+
+ /*
+ * Now that we are here, the old head pointer is
+ * set to UPDATE. This will keep the reader from
+ * swapping the head page with the reader page.
+ * The reader (on another CPU) will spin till
+ * we are finished.
+ *
+ * We just need to protect against interrupts
+ * doing the job. We will set the next pointer
+ * to HEAD. After that, we set the old pointer
+ * to NORMAL, but only if it was HEAD before.
+ * otherwise we are an interrupt, and only
+ * want the outer most commit to reset it.
+ */
+ new_head = next_page;
+ rb_inc_page(cpu_buffer, &new_head);
+
+ ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+ RB_PAGE_NORMAL);
+
+ /*
+ * Valid returns are:
+ * HEAD - an interrupt came in and already set it.
+ * NORMAL - One of two things:
+ * 1) We really set it.
+ * 2) A bunch of interrupts came in and moved
+ * the page forward again.
+ */
+ switch (ret) {
+ case RB_PAGE_HEAD:
+ case RB_PAGE_NORMAL:
+ /* OK */
+ break;
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ return -1;
+ }
+
+ /*
+ * It is possible that an interrupt came in,
+ * set the head up, then more interrupts came in
+ * and moved it again. When we get back here,
+ * the page would have been set to NORMAL but we
+ * just set it back to HEAD.
+ *
+ * How do you detect this? Well, if that happened
+ * the tail page would have moved.
+ */
+ if (ret == RB_PAGE_NORMAL) {
+ /*
+ * If the tail had moved passed next, then we need
+ * to reset the pointer.
+ */
+ if (cpu_buffer->tail_page != tail_page &&
+ cpu_buffer->tail_page != next_page)
+ rb_head_page_set_normal(cpu_buffer, new_head,
+ next_page,
+ RB_PAGE_HEAD);
+ }
+
+ /*
+ * If this was the outer most commit (the one that
+ * changed the original pointer from HEAD to UPDATE),
+ * then it is up to us to reset it to NORMAL.
+ */
+ if (type == RB_PAGE_HEAD) {
+ ret = rb_head_page_set_normal(cpu_buffer, next_page,
+ tail_page,
+ RB_PAGE_UPDATE);
+ if (RB_WARN_ON(cpu_buffer,
+ ret != RB_PAGE_UPDATE))
+ return -1;
+ }
+
+ return 0;
+}
+
static unsigned rb_calculate_event_length(unsigned length)
{
struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1200,96 +1793,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *commit_page,
struct buffer_page *tail_page, u64 *ts)
{
- struct buffer_page *next_page, *head_page, *reader_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
- bool lock_taken = false;
- unsigned long flags;
+ struct buffer_page *next_page;
+ int ret;
next_page = tail_page;
- local_irq_save(flags);
- /*
- * Since the write to the buffer is still not
- * fully lockless, we must be careful with NMIs.
- * The locks in the writers are taken when a write
- * crosses to a new page. The locks protect against
- * races with the readers (this will soon be fixed
- * with a lockless solution).
- *
- * Because we can not protect against NMIs, and we
- * want to keep traces reentrant, we need to manage
- * what happens when we are in an NMI.
- *
- * NMIs can happen after we take the lock.
- * If we are in an NMI, only take the lock
- * if it is not already taken. Otherwise
- * simply fail.
- */
- if (unlikely(in_nmi())) {
- if (!__raw_spin_trylock(&cpu_buffer->lock)) {
- cpu_buffer->nmi_dropped++;
- goto out_reset;
- }
- } else
- __raw_spin_lock(&cpu_buffer->lock);
-
- lock_taken = true;
-
rb_inc_page(cpu_buffer, &next_page);
- head_page = cpu_buffer->head_page;
- reader_page = cpu_buffer->reader_page;
-
- /* we grabbed the lock before incrementing */
- if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
- goto out_reset;
-
/*
* If for some reason, we had an interrupt storm that made
* it all the way around the buffer, bail, and warn
* about it.
*/
if (unlikely(next_page == commit_page)) {
- cpu_buffer->commit_overrun++;
+ local_inc(&cpu_buffer->commit_overrun);
goto out_reset;
}
- if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE))
- goto out_reset;
-
- /* tail_page has not moved yet? */
- if (tail_page == cpu_buffer->tail_page) {
- /* count overflows */
- cpu_buffer->overrun +=
- local_read(&head_page->entries);
+ /*
+ * This is where the fun begins!
+ *
+ * We are fighting against races between a reader that
+ * could be on another CPU trying to swap its reader
+ * page with the buffer head.
+ *
+ * We are also fighting against interrupts coming in and
+ * moving the head or tail on us as well.
+ *
+ * If the next page is the head page then we have filled
+ * the buffer, unless the commit page is still on the
+ * reader page.
+ */
+ if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
- rb_inc_page(cpu_buffer, &head_page);
- cpu_buffer->head_page = head_page;
- cpu_buffer->head_page->read = 0;
+ /*
+ * If the commit is not on the reader page, then
+ * move the header page.
+ */
+ if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+ /*
+ * If we are not in overwrite mode,
+ * this is easy, just stop here.
+ */
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ goto out_reset;
+
+ ret = rb_handle_head_page(cpu_buffer,
+ tail_page,
+ next_page);
+ if (ret < 0)
+ goto out_reset;
+ if (ret)
+ goto out_again;
+ } else {
+ /*
+ * We need to be careful here too. The
+ * commit page could still be on the reader
+ * page. We could have a small buffer, and
+ * have filled up the buffer with events
+ * from interrupts and such, and wrapped.
+ *
+ * Note, if the tail page is also the on the
+ * reader_page, we let it move out.
+ */
+ if (unlikely((cpu_buffer->commit_page !=
+ cpu_buffer->tail_page) &&
+ (cpu_buffer->commit_page ==
+ cpu_buffer->reader_page))) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
}
}
- /*
- * If the tail page is still the same as what we think
- * it is, then it is up to us to update the tail
- * pointer.
- */
- if (tail_page == cpu_buffer->tail_page) {
- local_set(&next_page->write, 0);
- local_set(&next_page->entries, 0);
- local_set(&next_page->page->commit, 0);
- cpu_buffer->tail_page = next_page;
-
- /* reread the time stamp */
+ ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+ if (ret) {
+ /*
+ * Nested commits always have zero deltas, so
+ * just reread the time stamp
+ */
*ts = rb_time_stamp(buffer, cpu_buffer->cpu);
- cpu_buffer->tail_page->page->time_stamp = *ts;
+ next_page->page->time_stamp = *ts;
}
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ out_again:
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -1298,9 +1888,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* reset write */
rb_reset_tail(cpu_buffer, tail_page, tail, length);
- if (likely(lock_taken))
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
return NULL;
}
@@ -1317,6 +1904,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
barrier();
tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
tail = write - length;
/* See if we shot pass the end of this buffer page */
@@ -1361,12 +1951,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = cpu_buffer->tail_page;
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ unsigned long write_mask =
+ local_read(&bpage->write) & ~RB_WRITE_MASK;
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
* and write to the next page. That is fine
* because we just shorten what is on this page.
*/
+ old_index += write_mask;
+ new_index += write_mask;
index = local_cmpxchg(&bpage->write, old_index, new_index);
if (index == old_index)
return 1;
@@ -1875,9 +2469,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = cpu_buffer->head_page;
+ struct buffer_page *head = rb_set_head_page(cpu_buffer);
struct buffer_page *commit = cpu_buffer->commit_page;
+ /* In case of error, head will be NULL */
+ if (unlikely(!head))
+ return 1;
+
return reader->read == rb_page_commit(reader) &&
(commit == reader ||
(commit == head &&
@@ -1968,7 +2566,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+ ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
- cpu_buffer->read;
return ret;
@@ -1989,33 +2587,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->overrun;
+ ret = local_read(&cpu_buffer->overrun);
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
/**
- * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
- * @buffer: The ring buffer
- * @cpu: The per CPU buffer to get the number of overruns from
- */
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long ret;
-
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return 0;
-
- cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->nmi_dropped;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
-
-/**
* ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2608,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->commit_overrun;
+ ret = local_read(&cpu_buffer->commit_overrun);
return ret;
}
@@ -2053,7 +2631,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
entries += (local_read(&cpu_buffer->entries) -
- cpu_buffer->overrun) - cpu_buffer->read;
+ local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
}
return entries;
@@ -2076,7 +2654,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- overruns += cpu_buffer->overrun;
+ overruns += local_read(&cpu_buffer->overrun);
}
return overruns;
@@ -2089,8 +2667,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
/* Iterator usage is expected to have record disabled */
if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = cpu_buffer->head_page;
- iter->head = cpu_buffer->head_page->read;
+ iter->head_page = rb_set_head_page(cpu_buffer);
+ if (unlikely(!iter->head_page))
+ return;
+ iter->head = iter->head_page->read;
} else {
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2787,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
struct buffer_page *reader = NULL;
unsigned long flags;
int nr_loops = 0;
+ int ret;
local_irq_save(flags);
__raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2821,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto out;
/*
- * Splice the empty reader page into the list around the head.
* Reset the reader page to size zero.
*/
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
- reader = cpu_buffer->head_page;
+ spin:
+ /*
+ * Splice the empty reader page into the list around the head.
+ */
+ reader = rb_set_head_page(cpu_buffer);
cpu_buffer->reader_page->list.next = reader->list.next;
cpu_buffer->reader_page->list.prev = reader->list.prev;
- local_set(&cpu_buffer->reader_page->write, 0);
- local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
+ /*
+ * cpu_buffer->pages just needs to point to the buffer, it
+ * has no specific buffer page to point to. Lets move it out
+ * of our way so we don't accidently swap it.
+ */
+ cpu_buffer->pages = reader->list.prev;
- /* Make the reader page now replace the head */
- reader->list.prev->next = &cpu_buffer->reader_page->list;
- reader->list.next->prev = &cpu_buffer->reader_page->list;
+ /* The reader page will be pointing to the new head */
+ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+
+ /*
+ * Here's the tricky part.
+ *
+ * We need to move the pointer past the header page.
+ * But we can only do that if a writer is not currently
+ * moving it. The page before the header page has the
+ * flag bit '1' set if it is pointing to the page we want.
+ * but if the writer is in the process of moving it
+ * than it will be '2' or already moved '0'.
+ */
+
+ ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
/*
- * If the tail is on the reader, then we must set the head
- * to the inserted page, otherwise we set it one before.
+ * If we did not convert it, then we must try again.
*/
- cpu_buffer->head_page = cpu_buffer->reader_page;
+ if (!ret)
+ goto spin;
- if (cpu_buffer->commit_page != reader)
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ /*
+ * Yeah! We succeeded in replacing the page.
+ *
+ * Now make the new head point back to the reader page.
+ */
+ reader->list.next->prev = &cpu_buffer->reader_page->list;
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
/* Finally update the reader page to the new head */
cpu_buffer->reader_page = reader;
@@ -2717,8 +3324,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ rb_head_page_deactivate(cpu_buffer);
+
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
local_set(&cpu_buffer->head_page->write, 0);
local_set(&cpu_buffer->head_page->entries, 0);
local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3343,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->read = 0;
- cpu_buffer->nmi_dropped = 0;
- cpu_buffer->commit_overrun = 0;
- cpu_buffer->overrun = 0;
- cpu_buffer->read = 0;
+ local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->overrun, 0);
local_set(&cpu_buffer->entries, 0);
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
+ cpu_buffer->read = 0;
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
+
+ rb_head_page_activate(cpu_buffer);
}
/**
@@ -3091,7 +3701,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
read = 0;
} else {
/* update the entry counter */
- cpu_buffer->read += local_read(&reader->entries);
+ cpu_buffer->read += rb_page_entries(reader);
/* swap the pages */
rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6904e381f51c..0b06266ba7fd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -50,7 +50,7 @@ unsigned long __read_mostly tracing_thresh;
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
*/
-static int ring_buffer_expanded;
+int ring_buffer_expanded;
/*
* We need to change this state when a selftest is running.
@@ -64,7 +64,7 @@ static bool __read_mostly tracing_selftest_running;
/*
* If a tracer is running, we do not want to run SELFTEST.
*/
-static bool __read_mostly tracing_selftest_disabled;
+bool __read_mostly tracing_selftest_disabled;
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
*/
static int tracing_disabled = 1;
-static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
static inline void ftrace_disable_cpu(void)
{
@@ -867,10 +867,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
return event;
}
-static void ftrace_trace_stack(struct trace_array *tr,
- unsigned long flags, int skip, int pc);
-static void ftrace_trace_userstack(struct trace_array *tr,
- unsigned long flags, int pc);
static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
struct ring_buffer_event *event,
@@ -947,54 +943,6 @@ trace_function(struct trace_array *tr,
ring_buffer_unlock_commit(tr->buffer, event);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int __trace_graph_entry(struct trace_array *tr,
- struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
-{
- struct ftrace_event_call *call = &event_funcgraph_entry;
- struct ring_buffer_event *event;
- struct ftrace_graph_ent_entry *entry;
-
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return 0;
-
- event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
- sizeof(*entry), flags, pc);
- if (!event)
- return 0;
- entry = ring_buffer_event_data(event);
- entry->graph_ent = *trace;
- if (!filter_current_check_discard(call, entry, event))
- ring_buffer_unlock_commit(global_trace.buffer, event);
-
- return 1;
-}
-
-static void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
-{
- struct ftrace_event_call *call = &event_funcgraph_exit;
- struct ring_buffer_event *event;
- struct ftrace_graph_ret_entry *entry;
-
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return;
-
- event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->ret = *trace;
- if (!filter_current_check_discard(call, entry, event))
- ring_buffer_unlock_commit(global_trace.buffer, event);
-}
-#endif
-
void
ftrace(struct trace_array *tr, struct trace_array_cpu *data,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,11 +952,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
trace_function(tr, ip, parent_ip, flags, pc);
}
+#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc)
{
-#ifdef CONFIG_STACKTRACE
struct ftrace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
@@ -1029,12 +977,10 @@ static void __ftrace_trace_stack(struct trace_array *tr,
save_stack_trace(&trace);
if (!filter_check_discard(call, entry, tr->buffer, event))
ring_buffer_unlock_commit(tr->buffer, event);
-#endif
}
-static void ftrace_trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc)
+void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc)
{
if (!(trace_flags & TRACE_ITER_STACKTRACE))
return;
@@ -1042,17 +988,14 @@ static void ftrace_trace_stack(struct trace_array *tr,
__ftrace_trace_stack(tr, flags, skip, pc);
}
-void __trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc)
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc)
{
__ftrace_trace_stack(tr, flags, skip, pc);
}
-static void ftrace_trace_userstack(struct trace_array *tr,
- unsigned long flags, int pc)
+void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc)
{
-#ifdef CONFIG_STACKTRACE
struct ftrace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
@@ -1077,7 +1020,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
save_stack_trace_user(&trace);
if (!filter_check_discard(call, entry, tr->buffer, event))
ring_buffer_unlock_commit(tr->buffer, event);
-#endif
}
#ifdef UNUSED
@@ -1087,6 +1029,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
}
#endif /* UNUSED */
+#endif /* CONFIG_STACKTRACE */
+
static void
ftrace_trace_special(void *__tr,
unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1115,62 +1059,6 @@ __trace_special(void *__tr, void *__data,
}
void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(tr, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- trace_buffer_unlock_commit(tr, event, flags, pc);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
- ftrace_trace_stack(tr, flags, 6, pc);
- ftrace_trace_userstack(tr, flags, pc);
-}
-
-void
ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
struct trace_array *tr = &global_trace;
@@ -1194,68 +1082,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
local_irq_restore(flags);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-int trace_graph_entry(struct ftrace_graph_ent *trace)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int ret;
- int cpu;
- int pc;
-
- if (!ftrace_trace_task(current))
- return 0;
-
- if (!ftrace_graph_addr(trace->func))
- return 0;
-
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
- } else {
- ret = 0;
- }
- /* Only do the atomic if it is not already set */
- if (!test_tsk_trace_graph(current))
- set_tsk_trace_graph(current);
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-
- return ret;
-}
-
-void trace_graph_return(struct ftrace_graph_ret *trace)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
- }
- if (!trace->depth)
- clear_tsk_trace_graph(current);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-
/**
* trace_vbprintk - write binary msg to tracing buffer
*
@@ -2255,8 +2081,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
len += 3; /* "no" and newline */
}
- /* +2 for \n and \0 */
- buf = kmalloc(len + 2, GFP_KERNEL);
+ /* +1 for \0 */
+ buf = kmalloc(len + 1, GFP_KERNEL);
if (!buf) {
mutex_unlock(&trace_types_lock);
return -ENOMEM;
@@ -2279,7 +2105,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
}
mutex_unlock(&trace_types_lock);
- WARN_ON(r >= len + 2);
+ WARN_ON(r >= len + 1);
r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -2290,23 +2116,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
/* Try to assign a tracer specific option */
static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
{
- struct tracer_flags *trace_flags = trace->flags;
+ struct tracer_flags *tracer_flags = trace->flags;
struct tracer_opt *opts = NULL;
int ret = 0, i = 0;
int len;
- for (i = 0; trace_flags->opts[i].name; i++) {
- opts = &trace_flags->opts[i];
+ for (i = 0; tracer_flags->opts[i].name; i++) {
+ opts = &tracer_flags->opts[i];
len = strlen(opts->name);
if (strncmp(cmp, opts->name, len) == 0) {
- ret = trace->set_flag(trace_flags->val,
+ ret = trace->set_flag(tracer_flags->val,
opts->bit, !neg);
break;
}
}
/* Not found */
- if (!trace_flags->opts[i].name)
+ if (!tracer_flags->opts[i].name)
return -EINVAL;
/* Refused to handle */
@@ -2314,9 +2140,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
return ret;
if (neg)
- trace_flags->val &= ~opts->bit;
+ tracer_flags->val &= ~opts->bit;
else
- trace_flags->val |= opts->bit;
+ tracer_flags->val |= opts->bit;
return 0;
}
@@ -3631,9 +3457,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
trace_seq_printf(s, "commit overrun: %ld\n", cnt);
- cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
- trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
-
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -4271,7 +4094,6 @@ void ftrace_dump(void)
__init static int tracer_alloc_buffers(void)
{
- struct trace_array_cpu *data;
int ring_buf_size;
int i;
int ret = -ENOMEM;
@@ -4320,7 +4142,7 @@ __init static int tracer_alloc_buffers(void)
/* Allocate the first page for all buffers */
for_each_tracing_cpu(i) {
- data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+ global_trace.data[i] = &per_cpu(global_trace_cpu, i);
max_tr.data[i] = &per_cpu(max_data, i);
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8a6281b2330b..3007b7cf798b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -35,8 +35,6 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
- TRACE_SYSCALL_ENTER,
- TRACE_SYSCALL_EXIT,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_POWER,
@@ -336,10 +334,6 @@ extern void __ftrace_bad_type(void);
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \
- IF_ASSIGN(var, ent, struct syscall_trace_enter, \
- TRACE_SYSCALL_ENTER); \
- IF_ASSIGN(var, ent, struct syscall_trace_exit, \
- TRACE_SYSCALL_EXIT); \
IF_ASSIGN(var, ent, struct trace_skb_event, \
TRACE_SKB_SOURCE); \
__ftrace_bad_type(); \
@@ -486,6 +480,7 @@ void trace_function(struct trace_array *tr,
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
+void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -504,9 +499,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-void __trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc);
+#ifdef CONFIG_STACKTRACE
+void ftrace_trace_stack(struct trace_array *tr, unsigned long flags,
+ int skip, int pc);
+
+void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags,
+ int pc);
+
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc);
+#else
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ unsigned long flags, int skip, int pc)
+{
+}
+
+static inline void ftrace_trace_userstack(struct trace_array *tr,
+ unsigned long flags, int pc)
+{
+}
+
+static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
+ int skip, int pc)
+{
+}
+#endif /* CONFIG_STACKTRACE */
extern cycle_t ftrace_now(int cpu);
@@ -532,6 +549,10 @@ extern unsigned long ftrace_update_tot_cnt;
extern int DYN_FTRACE_TEST_NAME(void);
#endif
+extern int ring_buffer_expanded;
+extern bool tracing_selftest_disabled;
+DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
@@ -762,13 +783,15 @@ struct event_filter {
int n_preds;
struct filter_pred **preds;
char *filter_string;
+ bool no_reset;
};
struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
- void *filter;
+ struct event_filter *filter;
+ int nr_events;
};
struct filter_pred;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..b568ade8f453 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,6 +17,8 @@
#include <linux/ctype.h>
#include <linux/delay.h>
+#include <asm/setup.h>
+
#include "trace_output.h"
#define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -84,14 +86,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
if (call->enabled) {
call->enabled = 0;
tracing_stop_cmdline_record();
- call->unregfunc();
+ call->unregfunc(call->data);
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
tracing_start_cmdline_record();
- call->regfunc();
+ call->regfunc(call->data);
}
break;
}
@@ -574,7 +576,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_printf(s, "format:\n");
trace_write_header(s);
- r = call->show_format(s);
+ r = call->show_format(call, s);
if (!r) {
/*
* ug! The format output is bigger than a PAGE!!
@@ -849,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ if (strcmp(system->name, name) == 0) {
+ system->nr_events++;
return system->entry;
+ }
}
/* need to create new entry */
@@ -869,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
return d_events;
}
+ system->nr_events = 1;
system->name = kstrdup(name, GFP_KERNEL);
if (!system->name) {
debugfs_remove(system->entry);
@@ -920,15 +925,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
if (strcmp(call->system, TRACE_SYSTEM) != 0)
d_events = event_subsystem_dir(call->system, d_events);
- if (call->raw_init) {
- ret = call->raw_init();
- if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
- }
- }
-
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
pr_warning("Could not create debugfs "
@@ -987,6 +983,32 @@ struct ftrace_module_file_ops {
struct file_operations filter;
};
+static void remove_subsystem_dir(const char *name)
+{
+ struct event_subsystem *system;
+
+ if (strcmp(name, TRACE_SYSTEM) == 0)
+ return;
+
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0) {
+ if (!--system->nr_events) {
+ struct event_filter *filter = system->filter;
+
+ debugfs_remove_recursive(system->entry);
+ list_del(&system->list);
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ kfree(system->name);
+ kfree(system);
+ }
+ break;
+ }
+ }
+}
+
static struct ftrace_module_file_ops *
trace_create_file_ops(struct module *mod)
{
@@ -1027,6 +1049,7 @@ static void trace_module_add_events(struct module *mod)
struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call *call, *start, *end;
struct dentry *d_events;
+ int ret;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1065,15 @@ static void trace_module_add_events(struct module *mod)
/* The linker may leave blanks */
if (!call->name)
continue;
-
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
/*
* This module has events, create file ops for this module
* if not already done.
@@ -1077,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod)
list_del(&call->list);
trace_destroy_fields(call);
destroy_preds(call);
+ remove_subsystem_dir(call->system);
}
}
@@ -1133,6 +1165,18 @@ struct notifier_block trace_module_nb = {
extern struct ftrace_event_call __start_ftrace_events[];
extern struct ftrace_event_call __stop_ftrace_events[];
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+ strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = 1;
+ tracing_selftest_disabled = 1;
+
+ return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
static __init int event_trace_init(void)
{
struct ftrace_event_call *call;
@@ -1140,6 +1184,8 @@ static __init int event_trace_init(void)
struct dentry *entry;
struct dentry *d_events;
int ret;
+ char *buf = bootup_event_buf;
+ char *token;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -1179,12 +1225,34 @@ static __init int event_trace_init(void)
/* The linker may leave blanks */
if (!call->name)
continue;
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
list_add(&call->list, &ftrace_events);
event_create_dir(call, d_events, &ftrace_event_id_fops,
&ftrace_enable_fops, &ftrace_event_filter_fops,
&ftrace_event_format_fops);
}
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ ret = ftrace_set_clr_event(token, 1);
+ if (ret)
+ pr_warning("Failed to enable trace event: %s\n", token);
+ }
+
ret = register_module_notifier(&trace_module_nb);
if (ret)
pr_warning("Failed to register trace events module notifier\n");
@@ -1392,10 +1460,10 @@ static __init void event_trace_self_test_with_function(void)
static __init int event_trace_self_tests_init(void)
{
-
- event_trace_self_tests();
-
- event_trace_self_test_with_function();
+ if (!tracing_selftest_disabled) {
+ event_trace_self_tests();
+ event_trace_self_test_with_function();
+ }
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..490337abed75 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
static int filter_pred_strloc(struct filter_pred *pred, void *event,
int val1, int val2)
{
- unsigned short str_loc = *(unsigned short *)(event + pred->offset);
+ u32 str_item = *(u32 *)(event + pred->offset);
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = strncmp(addr, pred->str_val, pred->str_len);
+ cmp = strncmp(addr, pred->str_val, str_len);
match = (!cmp) ^ pred->not;
@@ -418,24 +420,29 @@ oom:
}
EXPORT_SYMBOL_GPL(init_preds);
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+enum {
+ FILTER_DISABLE_ALL,
+ FILTER_INIT_NO_RESET,
+ FILTER_SKIP_NO_RESET,
+};
+
+static void filter_free_subsystem_preds(struct event_subsystem *system,
+ int flag)
{
- struct event_filter *filter = system->filter;
struct ftrace_event_call *call;
- int i;
-
- if (filter->n_preds) {
- for (i = 0; i < filter->n_preds; i++)
- filter_free_pred(filter->preds[i]);
- kfree(filter->preds);
- filter->preds = NULL;
- filter->n_preds = 0;
- }
list_for_each_entry(call, &ftrace_events, list) {
if (!call->define_fields)
continue;
+ if (flag == FILTER_INIT_NO_RESET) {
+ call->filter->no_reset = false;
+ continue;
+ }
+
+ if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
+ continue;
+
if (!strcmp(call->system, system->name)) {
filter_disable_preds(call);
remove_filter_string(call->filter);
@@ -537,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
- struct filter_pred *pred)
+ struct filter_pred *pred,
+ bool dry_run)
{
struct ftrace_event_field *field;
filter_pred_fn_t fn;
@@ -549,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
if (pred->op == OP_AND) {
pred->pop_n = 2;
- return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+ fn = filter_pred_and;
+ goto add_pred_fn;
} else if (pred->op == OP_OR) {
pred->pop_n = 2;
- return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+ fn = filter_pred_or;
+ goto add_pred_fn;
}
field = find_event_field(call, pred->field_name);
@@ -575,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps,
else
fn = filter_pred_strloc;
pred->str_len = field->size;
- if (pred->op == OP_NE)
- pred->not = 1;
- return filter_add_pred_fn(ps, call, pred, fn);
} else {
if (field->is_signed)
ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
return -EINVAL;
}
pred->val = val;
- }
- fn = select_comparison_fn(pred->op, field->size, field->is_signed);
- if (!fn) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
+ fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (!fn) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
}
if (pred->op == OP_NE)
pred->not = 1;
- return filter_add_pred_fn(ps, call, pred, fn);
+add_pred_fn:
+ if (!dry_run)
+ return filter_add_pred_fn(ps, call, pred, fn);
+ return 0;
}
static int filter_add_subsystem_pred(struct filter_parse_state *ps,
struct event_subsystem *system,
struct filter_pred *pred,
- char *filter_string)
+ char *filter_string,
+ bool dry_run)
{
- struct event_filter *filter = system->filter;
struct ftrace_event_call *call;
int err = 0;
-
- if (!filter->preds) {
- filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
- GFP_KERNEL);
-
- if (!filter->preds)
- return -ENOMEM;
- }
-
- if (filter->n_preds == MAX_FILTER_PRED) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
+ bool fail = true;
list_for_each_entry(call, &ftrace_events, list) {
@@ -632,19 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
if (strcmp(call->system, system->name))
continue;
- err = filter_add_pred(ps, call, pred);
- if (err) {
- filter_free_subsystem_preds(system);
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- goto out;
- }
- replace_filter_string(call->filter, filter_string);
+ if (call->filter->no_reset)
+ continue;
+
+ err = filter_add_pred(ps, call, pred, dry_run);
+ if (err)
+ call->filter->no_reset = true;
+ else
+ fail = false;
+
+ if (!dry_run)
+ replace_filter_string(call->filter, filter_string);
}
- filter->preds[filter->n_preds] = pred;
- filter->n_preds++;
-out:
- return err;
+ if (fail) {
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return err;
+ }
+ return 0;
}
static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps)
static int replace_preds(struct event_subsystem *system,
struct ftrace_event_call *call,
struct filter_parse_state *ps,
- char *filter_string)
+ char *filter_string,
+ bool dry_run)
{
char *operand1 = NULL, *operand2 = NULL;
struct filter_pred *pred;
struct postfix_elt *elt;
int err;
+ int n_preds = 0;
err = check_preds(ps);
if (err)
@@ -1027,24 +1033,14 @@ static int replace_preds(struct event_subsystem *system,
continue;
}
+ if (n_preds++ == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
if (elt->op == OP_AND || elt->op == OP_OR) {
pred = create_logical_pred(elt->op);
- if (!pred)
- return -ENOMEM;
- if (call) {
- err = filter_add_pred(ps, call, pred);
- filter_free_pred(pred);
- } else {
- err = filter_add_subsystem_pred(ps, system,
- pred, filter_string);
- if (err)
- filter_free_pred(pred);
- }
- if (err)
- return err;
-
- operand1 = operand2 = NULL;
- continue;
+ goto add_pred;
}
if (!operand1 || !operand2) {
@@ -1053,17 +1049,15 @@ static int replace_preds(struct event_subsystem *system,
}
pred = create_pred(elt->op, operand1, operand2);
+add_pred:
if (!pred)
return -ENOMEM;
- if (call) {
- err = filter_add_pred(ps, call, pred);
- filter_free_pred(pred);
- } else {
+ if (call)
+ err = filter_add_pred(ps, call, pred, false);
+ else
err = filter_add_subsystem_pred(ps, system, pred,
- filter_string);
- if (err)
- filter_free_pred(pred);
- }
+ filter_string, dry_run);
+ filter_free_pred(pred);
if (err)
return err;
@@ -1103,7 +1097,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out;
}
- err = replace_preds(NULL, call, ps, filter_string);
+ err = replace_preds(NULL, call, ps, filter_string, false);
if (err)
append_filter_err(ps, call->filter);
@@ -1127,7 +1121,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
mutex_lock(&event_mutex);
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system);
+ filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
remove_filter_string(system->filter);
mutex_unlock(&event_mutex);
return 0;
@@ -1138,7 +1132,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
if (!ps)
goto out_unlock;
- filter_free_subsystem_preds(system);
replace_filter_string(system->filter, filter_string);
parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1141,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out;
}
- err = replace_preds(system, NULL, ps, filter_string);
- if (err)
+ filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
+
+ /* try to see the filter can be applied to which events */
+ err = replace_preds(system, NULL, ps, filter_string, true);
+ if (err) {
append_filter_err(ps, system->filter);
+ goto out;
+ }
+
+ filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+
+ /* really apply the filter to the events */
+ err = replace_preds(system, NULL, ps, filter_string, false);
+ if (err) {
+ append_filter_err(ps, system->filter);
+ filter_free_subsystem_preds(system, 2);
+ }
out:
filter_opstack_clear(ps);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..956d4bc675e5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -60,7 +60,8 @@ extern void __bad_type_size(void);
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *unused, \
+ struct trace_seq *s) \
{ \
struct args field; \
int ret; \
@@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \
#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
tpfmt) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *unused, \
+ struct trace_seq *s) \
{ \
struct args field; \
int ret; \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..5b01b94518fc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- char str[KSYM_SYMBOL_LEN];
long count = (long)data;
- kallsyms_lookup(ip, NULL, NULL, NULL, str);
- seq_printf(m, "%s:", str);
+ seq_printf(m, "%pf:", (void *)ip);
if (ops == &traceon_probe_ops)
seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..3f4a251b7d16 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
-/* pid on the last trace processed */
+static struct trace_array *graph_array;
/* Add a function return address to the trace stack on thread info.*/
@@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+static int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ent_entry *entry;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return 0;
+
+ event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+ if (!filter_current_check_discard(call, entry, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
+
+ return 1;
+}
+
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int ret;
+ int cpu;
+ int pc;
+
+ if (unlikely(!tr))
+ return 0;
+
+ if (!ftrace_trace_task(current))
+ return 0;
+
+ if (!ftrace_graph_addr(trace->func))
+ return 0;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ } else {
+ ret = 0;
+ }
+ /* Only do the atomic if it is not already set */
+ if (!test_tsk_trace_graph(current))
+ set_tsk_trace_graph(current);
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ret_entry *entry;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->ret = *trace;
+ if (!filter_current_check_discard(call, entry, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ }
+ if (!trace->depth)
+ clear_tsk_trace_graph(current);
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
- int ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry);
+ int ret;
+
+ graph_array = tr;
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -177,49 +288,30 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+void set_graph_array(struct trace_array *tr)
+{
+ graph_array = tr;
+}
+
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
unregister_ftrace_graph();
}
-static inline int log10_cpu(int nb)
-{
- if (nb / 100)
- return 3;
- if (nb / 10)
- return 2;
- return 1;
-}
+static int max_bytes_for_cpu;
static enum print_line_t
print_graph_cpu(struct trace_seq *s, int cpu)
{
- int i;
int ret;
- int log10_this = log10_cpu(cpu);
- int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " ");
-
- /*
- * Tricky - we space the CPU field according to the max
- * number of online CPUs. On a 2-cpu system it would take
- * a maximum of 1 digit - on a 128 cpu system it would
- * take up to 3 digits:
- */
- for (i = 0; i < log10_all - log10_this; i++) {
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_printf(s, "%d) ", cpu);
+ ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -565,11 +657,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "();\n");
+ ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -612,11 +700,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "() {\n");
+ ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -934,6 +1018,8 @@ static struct tracer graph_trace __read_mostly = {
static __init int init_graph_trace(void)
{
+ max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..e1285d7b5488 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,34 @@ static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
static int sched_stopped;
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(tr, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, flags, pc);
+}
+
static void
probe_sched_switch(struct rq *__rq, struct task_struct *prev,
struct task_struct *next)
@@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
local_irq_restore(flags);
}
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
+ ftrace_trace_stack(tr, flags, 6, pc);
+ ftrace_trace_userstack(tr, flags, pc);
+}
+
static void
probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(tr);
+ set_graph_array(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
};
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+__next(struct seq_file *m, loff_t *pos)
{
- long i;
+ long n = *pos - 1;
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- i = 0;
- else {
- i = *(long *)v;
- i++;
- }
-
- if (i >= max_stack_trace.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
- m->private = (void *)i;
-
+ m->private = (void *)n;
return &m->private;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
{
- void *t = SEQ_START_TOKEN;
- loff_t l = 0;
+ (*pos)++;
+ return __next(m, pos);
+}
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
local_irq_disable();
__raw_spin_lock(&max_stack_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
- for (; t && l < *pos; t = t_next(m, t, &l))
- ;
-
- return t;
+ return __next(m, pos);
}
static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
static int trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
-
- sprint_symbol(str, addr);
- return seq_printf(m, "%s\n", str);
-#else
- return seq_printf(m, "%p\n", (void*)addr);
-#endif
+ return seq_printf(m, "%pF\n", (void *)addr);
}
static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
* but it will at least advance closer to the next one
* to be released.
*/
-static struct rb_node *release_next(struct rb_node *node)
+static struct rb_node *release_next(struct tracer_stat *ts,
+ struct rb_node *node)
{
struct stat_node *snode;
struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
parent->rb_right = NULL;
snode = container_of(node, struct stat_node, node);
+ if (ts->stat_release)
+ ts->stat_release(snode->stat);
kfree(snode);
return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
struct rb_node *node = session->stat_root.rb_node;
while (node)
- node = release_next(node);
+ node = release_next(session->ts, node);
session->stat_root = RB_ROOT;
}
@@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
{
struct stat_session *session = s->private;
struct rb_node *node;
+ int n = *pos;
int i;
/* Prevent from tracer switch or rbtree modification */
mutex_lock(&session->stat_mutex);
/* If we are in the beginning of the file, print the headers */
- if (!*pos && session->ts->stat_headers)
- return SEQ_START_TOKEN;
+ if (session->ts->stat_headers) {
+ if (n == 0)
+ return SEQ_START_TOKEN;
+ n--;
+ }
node = rb_first(&session->stat_root);
- for (i = 0; node && i < *pos; i++)
+ for (i = 0; node && i < n; i++)
node = rb_next(node);
return node;
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
int (*stat_cmp)(void *p1, void *p2);
/* Print a stat entry */
int (*stat_show)(struct seq_file *s, void *p);
+ /* Release an entry */
+ void (*stat_release)(void *stat);
/* Print the headers of your stat entries */
int (*stat_headers)(struct seq_file *s);
};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..f130dacfeef4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,17 @@
#include <trace/syscall.h>
#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/perf_counter.h>
#include <asm/syscall.h>
#include "trace_output.h"
#include "trace.h"
-/* Keep a counter of the syscall tracing users */
-static int refcount;
-
-/* Prevent from races on thread flags toggling */
static DEFINE_MUTEX(syscall_trace_lock);
-
-/* Option to display the parameters types */
-enum {
- TRACE_SYSCALLS_OPT_TYPES = 0x1,
-};
-
-static struct tracer_opt syscalls_opts[] = {
- { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
- { }
-};
-
-static struct tracer_flags syscalls_flags = {
- .val = 0, /* By default: no parameters types */
- .opts = syscalls_opts
-};
+static int sys_refcount_enter;
+static int sys_refcount_exit;
+static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX);
+static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX);
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,21 +22,25 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
struct syscall_metadata *entry;
int i, ret, syscall;
- trace_assign_type(trace, ent);
-
+ trace = (typeof(trace))ent;
syscall = trace->nr;
-
entry = syscall_nr_to_meta(syscall);
+
if (!entry)
goto end;
+ if (entry->enter_id != ent->type) {
+ WARN_ON_ONCE(1);
+ goto end;
+ }
+
ret = trace_seq_printf(s, "%s(", entry->name);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
for (i = 0; i < entry->nb_args; i++) {
/* parameter types */
- if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
+ if (trace_flags & TRACE_ITER_VERBOSE) {
ret = trace_seq_printf(s, "%s ", entry->types[i]);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -77,16 +68,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
struct syscall_metadata *entry;
int ret;
- trace_assign_type(trace, ent);
-
+ trace = (typeof(trace))ent;
syscall = trace->nr;
-
entry = syscall_nr_to_meta(syscall);
+
if (!entry) {
trace_seq_printf(s, "\n");
return TRACE_TYPE_HANDLED;
}
+ if (entry->exit_id != ent->type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
if (!ret)
@@ -95,54 +90,53 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-void start_ftrace_syscalls(void)
-{
- unsigned long flags;
- struct task_struct *g, *t;
-
- mutex_lock(&syscall_trace_lock);
-
- /* Don't enable the flag on the tasks twice */
- if (++refcount != 1)
- goto unlock;
-
- arch_init_ftrace_syscalls();
- read_lock_irqsave(&tasklist_lock, flags);
-
- do_each_thread(g, t) {
- set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
- } while_each_thread(g, t);
-
- read_unlock_irqrestore(&tasklist_lock, flags);
-
-unlock:
- mutex_unlock(&syscall_trace_lock);
-}
-
-void stop_ftrace_syscalls(void)
+int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ int i;
+ int nr;
+ int ret = 0;
+ struct syscall_metadata *entry;
+ int offset = sizeof(struct trace_entry);
- mutex_lock(&syscall_trace_lock);
+ nr = syscall_name_to_nr((char *)call->data);
+ entry = syscall_nr_to_meta(nr);
- /* There are perhaps still some users */
- if (--refcount)
- goto unlock;
+ if (!entry)
+ return ret;
- read_lock_irqsave(&tasklist_lock, flags);
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
+ entry->args[i]);
+ if (!ret)
+ return 0;
+ ret = trace_seq_printf(s, "\toffset:%d;\tsize:%lu;\n", offset,
+ sizeof(unsigned long));
+ if (!ret)
+ return 0;
+ offset += sizeof(unsigned long);
+ }
- do_each_thread(g, t) {
- clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
- } while_each_thread(g, t);
+ trace_seq_printf(s, "\nprint fmt: \"");
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, "%s: 0x%%0%lulx%s", entry->args[i],
+ sizeof(unsigned long),
+ i == entry->nb_args - 1 ? "\", " : ", ");
+ if (!ret)
+ return 0;
+ }
- read_unlock_irqrestore(&tasklist_lock, flags);
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, "((unsigned long)(REC->%s))%s",
+ entry->args[i],
+ i == entry->nb_args - 1 ? "\n" : ", ");
+ if (!ret)
+ return 0;
+ }
-unlock:
- mutex_unlock(&syscall_trace_lock);
+ return ret;
}
-void ftrace_syscall_enter(struct pt_regs *regs)
+void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -151,6 +145,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_enter_syscalls))
+ return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
@@ -158,7 +154,7 @@ void ftrace_syscall_enter(struct pt_regs *regs)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
+ event = trace_current_buffer_lock_reserve(sys_data->enter_id, size,
0, 0);
if (!event)
return;
@@ -171,7 +167,7 @@ void ftrace_syscall_enter(struct pt_regs *regs)
trace_wake_up();
}
-void ftrace_syscall_exit(struct pt_regs *regs)
+void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -179,12 +175,14 @@ void ftrace_syscall_exit(struct pt_regs *regs)
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_exit_syscalls))
+ return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
- event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
+ event = trace_current_buffer_lock_reserve(sys_data->exit_id,
sizeof(*entry), 0, 0);
if (!event)
return;
@@ -197,54 +195,247 @@ void ftrace_syscall_exit(struct pt_regs *regs)
trace_wake_up();
}
-static int init_syscall_tracer(struct trace_array *tr)
+int reg_event_syscall_enter(void *ptr)
{
- start_ftrace_syscalls();
+ int ret = 0;
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_enter)
+ ret = register_trace_syscall_enter(ftrace_syscall_enter);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_enter_syscalls);
+ sys_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_event_syscall_enter(void *ptr)
+{
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_enter--;
+ clear_bit(num, enabled_enter_syscalls);
+ if (!sys_refcount_enter)
+ unregister_trace_syscall_enter(ftrace_syscall_enter);
+ mutex_unlock(&syscall_trace_lock);
+}
- return 0;
+int reg_event_syscall_exit(void *ptr)
+{
+ int ret = 0;
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_exit)
+ ret = register_trace_syscall_exit(ftrace_syscall_exit);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall exit trace point");
+ } else {
+ set_bit(num, enabled_exit_syscalls);
+ sys_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
}
-static void reset_syscall_tracer(struct trace_array *tr)
+void unreg_event_syscall_exit(void *ptr)
{
- stop_ftrace_syscalls();
- tracing_reset_online_cpus(tr);
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_exit--;
+ clear_bit(num, enabled_exit_syscalls);
+ if (!sys_refcount_exit)
+ unregister_trace_syscall_exit(ftrace_syscall_exit);
+ mutex_unlock(&syscall_trace_lock);
}
-static struct trace_event syscall_enter_event = {
- .type = TRACE_SYSCALL_ENTER,
- .trace = print_syscall_enter,
+struct trace_event event_syscall_enter = {
+ .trace = print_syscall_enter,
};
-static struct trace_event syscall_exit_event = {
- .type = TRACE_SYSCALL_EXIT,
- .trace = print_syscall_exit,
+struct trace_event event_syscall_exit = {
+ .trace = print_syscall_exit,
};
-static struct tracer syscall_tracer __read_mostly = {
- .name = "syscall",
- .init = init_syscall_tracer,
- .reset = reset_syscall_tracer,
- .flags = &syscalls_flags,
+#ifdef CONFIG_EVENT_PROFILE
+
+struct syscall_enter_record {
+ struct trace_entry entry;
+ unsigned long args[0];
+};
+
+struct syscall_exit_record {
+ struct trace_entry entry;
+ unsigned long ret;
};
-__init int register_ftrace_syscalls(void)
+static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX);
+static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX);
+static int sys_prof_refcount_enter;
+static int sys_prof_refcount_exit;
+
+static void prof_syscall_enter(struct pt_regs *regs, long id)
{
- int ret;
+ struct syscall_enter_record *rec;
+ struct syscall_metadata *sys_data;
+ int syscall_nr;
+ int size;
- ret = register_ftrace_event(&syscall_enter_event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- syscall_enter_event.type);
- WARN_ON_ONCE(1);
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ /* get the size after alignment with the u32 buffer size field */
+ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ do {
+ char raw_data[size];
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ rec = (struct syscall_enter_record *) raw_data;
+ tracing_generic_entry_update(&rec->entry, 0, 0);
+ rec->entry.type = sys_data->enter_id;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ (unsigned long *)&rec->args);
+ perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+ } while(0);
+}
+
+int reg_prof_syscall_enter(char *name)
+{
+ int ret = 0;
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return -ENOSYS;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_prof_refcount_enter)
+ ret = register_trace_syscall_enter(prof_syscall_enter);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_prof_enter_syscalls);
+ sys_prof_refcount_enter++;
}
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
- ret = register_ftrace_event(&syscall_exit_event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- syscall_exit_event.type);
- WARN_ON_ONCE(1);
+void unreg_prof_syscall_enter(char *name)
+{
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_prof_refcount_enter--;
+ clear_bit(num, enabled_prof_enter_syscalls);
+ if (!sys_prof_refcount_enter)
+ unregister_trace_syscall_enter(prof_syscall_enter);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static void prof_syscall_exit(struct pt_regs *regs, long ret)
+{
+ struct syscall_metadata *sys_data;
+ struct syscall_exit_record rec;
+ int syscall_nr;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ tracing_generic_entry_update(&rec.entry, 0, 0);
+ rec.entry.type = sys_data->exit_id;
+ rec.ret = syscall_get_return_value(current, regs);
+
+ perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+}
+
+int reg_prof_syscall_exit(char *name)
+{
+ int ret = 0;
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return -ENOSYS;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_prof_refcount_exit)
+ ret = register_trace_syscall_exit(prof_syscall_exit);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_prof_exit_syscalls);
+ sys_prof_refcount_exit++;
}
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_prof_syscall_exit(char *name)
+{
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= FTRACE_SYSCALL_MAX)
+ return;
- return register_tracer(&syscall_tracer);
+ mutex_lock(&syscall_trace_lock);
+ sys_prof_refcount_exit--;
+ clear_bit(num, enabled_prof_exit_syscalls);
+ if (!sys_prof_refcount_exit)
+ unregister_trace_syscall_exit(prof_syscall_exit);
+ mutex_unlock(&syscall_trace_lock);
}
-device_initcall(register_ftrace_syscalls);
+
+#endif
+
+
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
#include <trace/events/workqueue.h>
#include <linux/list.h>
#include <linux/percpu.h>
+#include <linux/kref.h>
#include "trace_stat.h"
#include "trace.h"
@@ -16,6 +17,7 @@
/* A cpu workqueue thread */
struct cpu_workqueue_stats {
struct list_head list;
+ struct kref kref;
int cpu;
pid_t pid;
/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+ kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
/* Insertion of a work */
static void
probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
return;
}
INIT_LIST_HEAD(&cws->list);
+ kref_init(&cws->kref);
cws->cpu = cpu;
-
cws->pid = wq_thread->pid;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
list) {
if (node->pid == wq_thread->pid) {
list_del(&node->list);
- kfree(node);
+ kref_put(&node->kref, cpu_workqueue_stat_free);
goto found;
}
}
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+ if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
+ }
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
static void *workqueue_stat_next(void *prev, int idx)
{
struct cpu_workqueue_stats *prev_cws = prev;
+ struct cpu_workqueue_stats *ret;
int cpu = prev_cws->cpu;
unsigned long flags;
- void *ret = NULL;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
return NULL;
} while (!(ret = workqueue_stat_start_cpu(cpu)));
return ret;
+ } else {
+ ret = list_entry(prev_cws->list.next,
+ struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
}
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
- return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
- list);
+ return ret;
}
static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
return 0;
}
+static void workqueue_stat_release(void *stat)
+{
+ struct cpu_workqueue_stats *node = stat;
+
+ kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
static int workqueue_stat_headers(struct seq_file *s)
{
seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
.stat_start = workqueue_stat_start,
.stat_next = workqueue_stat_next,
.stat_show = workqueue_stat_show,
+ .stat_release = workqueue_stat_release,
.stat_headers = workqueue_stat_headers
};
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b6bce8613a9c..8f62ead6ba01 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
#include <linux/tracepoint.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/sched.h>
extern struct tracepoint __start___tracepoints[];
extern struct tracepoint __stop___tracepoints[];
@@ -577,3 +578,43 @@ static int init_tracepoints(void)
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+
+static DEFINE_MUTEX(regfunc_mutex);
+static int sys_tracepoint_refcount;
+
+void syscall_regfunc(void)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+
+ mutex_lock(&regfunc_mutex);
+ if (!sys_tracepoint_refcount) {
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+ } while_each_thread(g, t);
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ sys_tracepoint_refcount++;
+ mutex_unlock(&regfunc_mutex);
+}
+
+void syscall_unregfunc(void)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+
+ mutex_lock(&regfunc_mutex);
+ sys_tracepoint_refcount--;
+ if (!sys_tracepoint_refcount) {
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+ } while_each_thread(g, t);
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ mutex_unlock(&regfunc_mutex);
+}
+#endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..3c44b56b0da7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -600,7 +600,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
* schedule_work - put work task in global workqueue
* @work: job to be done
*
- * This puts a job in the kernel-global workqueue.
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
*/
int schedule_work(struct work_struct *work)
{