From 6f4156723c084bfc0c0f72205c541fafb8ad3ded Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 5 Oct 2012 12:13:07 -0400
Subject: tracing: Allow tracers to start at core initcall

There are times during debugging when it is helpful to see traces of early
boot functions. But the tracers are initialized at device_initcall(), which
is quite late in the boot process. Setting the kernel command line parameter
ftrace=function will not show anything until the function tracer is
initialized. This prevents being able to trace functions before
device_initcall().

There's no reason that the tracers need to be initialized so late in the
boot process. Move them up to core_initcall(), as they still need to come
after early_initcall(), which initializes the tracing buffers.

Cc: Thomas Gleixner
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_functions_graph.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace/trace_functions_graph.c')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 99b4378393d5..a84b55879bc4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void)
 	return register_tracer(&graph_trace);
 }
 
-device_initcall(init_graph_trace);
+core_initcall(init_graph_trace);
--
cgit v1.2.3

From 7ffbd48d5cab22bcd1120eb2349db1319e2d827a Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 11 Oct 2012 12:14:25 -0400
Subject: tracing: Cache comms only after an event occurred

Whenever an event is registered, the comms of tasks are saved at every
task switch instead of at every event. But if an event isn't executed
much, the comm cache will be filled up by tasks that did not record the
event, and you lose out on the comms that did.

Here's an example. If you enable the following events:

  echo 1 > /debug/tracing/events/kvm/kvm_cr/enable
  echo 1 > /debug/tracing/events/net/net_dev_xmit/enable

Note, there's no kvm running on this machine so the first event will
never be triggered, but because it is enabled, the storing of comms
will continue.

If we now disable the network event:

  echo 0 > /debug/tracing/events/net/net_dev_xmit/enable

and look at the trace:

  cat /debug/tracing/trace
  sshd-2672 [001] ..s2 375.731616: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s1 375.731617: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s2 375.859356: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s1 375.859357: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s2 375.947351: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s1 375.947352: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s2 376.035383: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s1 376.035383: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  sshd-2672 [001] ..s2 377.563806: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=226 rc=0
  sshd-2672 [001] ..s1 377.563807: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=226 rc=0
  sshd-2672 [001] ..s2 377.563834: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6be0 len=114 rc=0
  sshd-2672 [001] ..s1 377.563842: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6be0 len=114 rc=0

We see that process 2672 which triggered the events has the comm "sshd".
But if we run hackbench for a bit and look again:

  cat /debug/tracing/trace
  <...>-2672 [001] ..s2 375.731616: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s1 375.731617: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s2 375.859356: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s1 375.859357: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s2 375.947351: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s1 375.947352: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s2 376.035383: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s1 376.035383: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0
  <...>-2672 [001] ..s2 377.563806: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=226 rc=0
  <...>-2672 [001] ..s1 377.563807: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=226 rc=0
  <...>-2672 [001] ..s2 377.563834: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6be0 len=114 rc=0
  <...>-2672 [001] ..s1 377.563842: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6be0 len=114 rc=0

The stored "sshd" comm has been flushed out and we get a useless "<...>".
But by only storing comms after a trace event occurred, we can run
hackbench all day and still get the same output.

Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace.c                 | 35 +++++++++++++++++++++++++++--------
 kernel/trace/trace.h                 |  3 +++
 kernel/trace/trace_branch.c          |  2 +-
 kernel/trace/trace_functions_graph.c |  4 ++--
 4 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'kernel/trace/trace_functions_graph.c')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b90a827a4641..88111b08b2c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -77,6 +77,13 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
 	return 0;
 }
 
+/*
+ * To prevent the comm cache from being overwritten when no
+ * tracing is active, only save the comm when a trace event
+ * occurred.
+ */
+static DEFINE_PER_CPU(bool, trace_cmdline_save);
+
 /*
  * Kill all tracing for good (never come back).
  * It is initialized to 1 but will turn to zero if the initialization
@@ -1135,6 +1142,11 @@ void tracing_record_cmdline(struct task_struct *tsk)
 	    !tracing_is_on())
 		return;
 
+	if (!__this_cpu_read(trace_cmdline_save))
+		return;
+
+	__this_cpu_write(trace_cmdline_save, false);
+
 	trace_save_cmdline(tsk);
 }
 
@@ -1178,13 +1190,20 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
 	return event;
 }
 
+void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+	__this_cpu_write(trace_cmdline_save, true);
+	ring_buffer_unlock_commit(buffer, event);
+}
+
 static inline void
 __trace_buffer_unlock_commit(struct ring_buffer *buffer,
 			     struct ring_buffer_event *event,
 			     unsigned long flags, int pc,
 			     int wake)
 {
-	ring_buffer_unlock_commit(buffer, event);
+	__buffer_unlock_commit(buffer, event);
 
 	ftrace_trace_stack(buffer, flags, 6, pc);
 	ftrace_trace_userstack(buffer, flags, pc);
@@ -1232,7 +1251,7 @@ void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
 					    unsigned long flags, int pc,
 					    struct pt_regs *regs)
 {
-	ring_buffer_unlock_commit(buffer, event);
+	__buffer_unlock_commit(buffer, event);
 
 	ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
@@ -1269,7 +1288,7 @@ trace_function(struct trace_array *tr,
 	entry->parent_ip = parent_ip;
 
 	if (!filter_check_discard(call, entry, buffer, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 }
 
 void
@@ -1362,7 +1381,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	entry->size = trace.nr_entries;
 
 	if (!filter_check_discard(call, entry, buffer, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 
  out:
 	/* Again, don't let gcc optimize things here */
@@ -1458,7 +1477,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	save_stack_trace_user(&trace);
 
 	if (!filter_check_discard(call, entry, buffer, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 
  out_drop_count:
 	__this_cpu_dec(user_stack_count);
@@ -1653,7 +1672,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
 	if (!filter_check_discard(call, entry, buffer, event)) {
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(buffer, flags, 6, pc);
 	}
@@ -1724,7 +1743,7 @@ int trace_array_vprintk(struct trace_array *tr,
 	memcpy(&entry->buf, tbuffer, len);
 	entry->buf[len] = '\0';
 	if (!filter_check_discard(call, entry, buffer, event)) {
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(buffer, flags, 6, pc);
 	}
 out:
@@ -3993,7 +4012,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	} else
 		entry->buf[cnt] = '\0';
 
-	ring_buffer_unlock_commit(buffer, event);
+	__buffer_unlock_commit(buffer, event);
 
 	written = cnt;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7824a55bd3fc..839ae003a053 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -359,6 +359,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts);
 
+void __buffer_unlock_commit(struct ring_buffer *buffer,
+			    struct ring_buffer_event *event);
+
 int trace_empty(struct trace_iterator *iter);
 
 void *trace_find_next_entry_inc(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index bd3e0eef4eaa..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->correct = val == expect;
 
 	if (!filter_check_discard(call, entry, buffer, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a84b55879bc4..4edb4b74eb7e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr,
 	entry = ring_buffer_event_data(event);
 	entry->graph_ent = *trace;
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 
 	return 1;
 }
@@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr,
 	entry = ring_buffer_event_data(event);
 	entry->ret = *trace;
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		ring_buffer_unlock_commit(buffer, event);
+		__buffer_unlock_commit(buffer, event);
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
--
cgit v1.2.3
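The mechanism this patch relies on can be seen in isolation: a per-CPU flag
that is set only when an event is actually committed to the ring buffer and
consumed by the next comm save. Below is a minimal user-space C sketch of
that gating idea; it is an editor's illustration, not kernel code, with a
single global bool standing in for the DEFINE_PER_CPU variable and all
names simplified.

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  /* Stand-in for the kernel's per-CPU trace_cmdline_save flag. */
  static bool trace_cmdline_save;
  /* Stand-in for the pid -> comm cache; an unknown pid prints as "<...>". */
  static char cached_comm[16] = "<...>";

  /* Called at every task switch (cf. tracing_record_cmdline()). */
  static void record_cmdline(const char *comm)
  {
          if (!trace_cmdline_save)
                  return;                 /* no event committed since last save */
          trace_cmdline_save = false;     /* consume the flag */
          strncpy(cached_comm, comm, sizeof(cached_comm) - 1);
          cached_comm[sizeof(cached_comm) - 1] = '\0';
  }

  /* Called when an event is committed (cf. __buffer_unlock_commit()). */
  static void buffer_unlock_commit(void)
  {
          trace_cmdline_save = true;
  }

  int main(void)
  {
          record_cmdline("hackbench");    /* ignored: nothing was traced yet */
          printf("%s\n", cached_comm);    /* prints "<...>" */

          buffer_unlock_commit();         /* sshd emits a trace event */
          record_cmdline("sshd");         /* comm is cached once */
          record_cmdline("hackbench");    /* ignored: flag already consumed */
          printf("%s\n", cached_comm);    /* prints "sshd" */
          return 0;
  }

Run as-is it prints "<...>" and then "sshd", mirroring the before and after
traces quoted in the changelog.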
From 8741db532e86da2e54f05be751bfe1922ca63d57 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 16 Jan 2013 10:49:37 -0500
Subject: tracing/fgraph: Add max_graph_depth to limit function_graph depth

Add the file max_graph_depth to the debug tracing directory that lets
the user define the depth of the function graph.

A very useful operation is to set the depth to 1. Then it traces only
the first function that is called when entering the kernel. This can
be used to determine what system operations interrupt a process.

For example, to work on NOHZ processes (single tasks running without
a timer tick), if any interrupt goes off and preempts that task, this
code will show it happening.

  # cd /sys/kernel/debug/tracing
  # echo 1 > max_graph_depth
  # echo function_graph > current_tracer
  # cat per_cpu/cpu<X>/trace

Cc: Frederic Weisbecker
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

(limited to 'kernel/trace/trace_functions_graph.c')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..7008d2e13cf2 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_ABS_TIME	0x20
 #define TRACE_GRAPH_PRINT_IRQS		0x40
 
+static unsigned int max_depth;
+
 static struct tracer_opt trace_opts[] = {
 	/* Display overruns? (for self-debug purpose) */
 	{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -250,8 +252,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 		return 0;
 
 	/* trace it when it is-nested-in or is a function enabled. */
-	if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
-	      ftrace_graph_ignore_irqs())
+	if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+	     ftrace_graph_ignore_irqs()) ||
+	    (max_depth && trace->depth >= max_depth))
 		return 0;
 
 	local_irq_save(flags);
@@ -1457,6 +1460,59 @@ static struct tracer graph_trace __read_mostly = {
 #endif
 };
 
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	max_depth = val;
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+		 loff_t *ppos)
+{
+	char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+	int n;
+
+	n = sprintf(buf, "%d\n", max_depth);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+	.open		= tracing_open_generic,
+	.write		= graph_depth_write,
+	.read		= graph_depth_read,
+	.llseek		= generic_file_llseek,
+};
+
+static __init int init_graph_debugfs(void)
+{
+	struct dentry *d_tracer;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	trace_create_file("max_graph_depth", 0644, d_tracer,
+			  NULL, &graph_depth_fops);
+
+	return 0;
+}
+fs_initcall(init_graph_debugfs);
+
 static __init int init_graph_trace(void)
 {
 	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
--
cgit v1.2.3
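A note on the semantics implied by the test added to trace_graph_entry():
a value of 0 (the default) means no limit, and because graph depth counts
from 0, writing 1 to max_graph_depth keeps only the outermost function of
each entry into the kernel. Below is a small C sketch of just that check,
an editor's illustration with made-up names rather than the kernel code.

  #include <stdio.h>

  static unsigned int max_depth;  /* 0 means "no limit", as in the patch */

  /* Same shape as the patch's test: (max_depth && depth >= max_depth)
   * drops the entry, anything shallower is traced. */
  static int depth_is_traced(int depth)
  {
          return !(max_depth && depth >= max_depth);
  }

  int main(void)
  {
          max_depth = 1;  /* i.e. echo 1 > max_graph_depth */
          for (int depth = 0; depth < 3; depth++)
                  printf("depth %d: %s\n", depth,
                         depth_is_traced(depth) ? "traced" : "dropped");
          return 0;
  }

With max_depth set to 1 only depth 0 survives, which is why the changelog
describes this setting as tracing only the first function called when
entering the kernel.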
From 03274a3ffb449632970fdd35da72ea41cf8474da Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Tue, 29 Jan 2013 17:30:31 -0500
Subject: tracing/fgraph: Adjust fgraph depth before calling trace return callback

While debugging the virtual cputime with the function graph tracer with a
max_depth of 1 (most common use of the max_depth so far), I found that I
was missing kernel execution because of a race condition.

The code for the return side of the function has a slight race:

	ftrace_pop_return_trace(&trace, &ret, frame_pointer);
	trace.rettime = trace_clock_local();
	ftrace_graph_return(&trace);
	barrier();
	current->curr_ret_stack--;

The ftrace_pop_return_trace() initializes the trace structure for the
callback. The ftrace_graph_return() uses the trace structure for its own
use as that structure is on the stack and is local to this function. Then
the curr_ret_stack is decremented, which is what the trace.depth is set to.

If an interrupt comes in after the ftrace_graph_return() but before the
curr_ret_stack is decremented, then the called function will get a depth
of 2. If max_depth is set to 1 this function will be ignored.

The problem is that the trace has already been called, and the timestamp
for that trace will not reflect the time the function was about to
re-enter userspace. Calls to the interrupt will not be traced because
the max_depth has prevented this.

To solve this issue, the ftrace_graph_return() can safely be moved after
the current->curr_ret_stack has been updated. This way the timestamp for
the return callback will reflect the actual time.

If an interrupt comes in after the curr_ret_stack update and before the
ftrace_graph_return(), it will be traced. It may look a little confusing
to see it within the other function, but at least it will not be lost.

Cc: Frederic Weisbecker
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_functions_graph.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel/trace/trace_functions_graph.c')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 7008d2e13cf2..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -191,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 
 	ftrace_pop_return_trace(&trace, &ret, frame_pointer);
 	trace.rettime = trace_clock_local();
-	ftrace_graph_return(&trace);
 	barrier();
 	current->curr_ret_stack--;
 
+	/*
+	 * The trace should run after decrementing the ret counter
+	 * in case an interrupt were to come in. We don't want to
+	 * lose the interrupt if max_depth is set.
+	 */
+	ftrace_graph_return(&trace);
+
 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
 		WARN_ON(1);
--
cgit v1.2.3
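Reduced to the depth arithmetic, the race above comes down to which value
of curr_ret_stack an interrupting function sees. The following toy C model
is an editor's illustration with illustrative values, not kernel code: it
shows why an interrupt arriving before the decrement was dropped under the
old ordering with max_graph_depth=1, while one arriving after the
decrement, which is the only window left by the new ordering, is traced.

  #include <stdio.h>

  static unsigned int max_depth = 1;  /* the common max_graph_depth=1 setup */
  static int curr_ret_stack = 1;      /* one function currently returning */

  /* Same depth test that trace_graph_entry() applies. */
  static void irq_entry(const char *when, int depth)
  {
          if (max_depth && depth >= max_depth)
                  printf("%s: interrupt dropped (depth %d)\n", when, depth);
          else
                  printf("%s: interrupt traced (depth %d)\n", when, depth);
  }

  int main(void)
  {
          /* Old order: the return callback ran first, so an interrupt firing
           * here still saw the not-yet-decremented depth and was dropped. */
          irq_entry("before decrement", curr_ret_stack);

          /* New order: curr_ret_stack is decremented before the callback, so
           * a late interrupt sees the lower depth and is traced. */
          curr_ret_stack--;
          irq_entry("after decrement", curr_ret_stack);
          return 0;
  }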