From f81ab074c30234b07c8309c542cafd07bed721f7 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 16 Aug 2011 14:46:15 -0700 Subject: trace: Add a new readonly entry to report total buffer size The current file "buffer_size_kb" reports the size of per-cpu buffer and not the overall memory allocated which could be misleading. A new file "buffer_total_size_kb" adds up all the enabled CPU buffer sizes and reports it. This is only a readonly entry. Signed-off-by: Vaibhav Nagarnaik Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1313531179-9323-2-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..01176788c387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3568,6 +3568,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += tr->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3739,6 +3763,12 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_free_buffer_fops = { .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, @@ -4450,6 +4480,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + &global_trace, &tracing_total_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, &global_trace, &tracing_free_buffer_fops); -- cgit v1.2.3 From c64e148a3be3cb786534ad38298c25c833116c26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 16 Aug 2011 14:46:16 -0700 Subject: trace: Add ring buffer stats to measure rate of events The stats file under per_cpu folder provides the number of entries, overruns and other statistics about the CPU ring buffer. However, the numbers do not provide any indication of how full the ring buffer is in bytes compared to the overall size in bytes. Also, it is helpful to know the rate at which the cpu buffer is filling up. This patch adds an entry "bytes: " in printed stats for per_cpu ring buffer which provides the actual bytes consumed in the ring buffer. This field includes the number of bytes used by recorded events and the padding bytes added when moving the tail pointer to next page. It also adds the following time stamps: "oldest event ts:" - the oldest timestamp in the ring buffer "now ts:" - the timestamp at the time of reading The field "now ts" provides a consistent time snapshot to the userspace when being read. This is read from the same trace clock used by tracing event timestamps. Together, these values provide the rate at which the buffer is filling up, from the formula: bytes / (now_ts - oldest_event_ts) Signed-off-by: Vaibhav Nagarnaik Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1313531179-9323-3-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 70 ++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 13 +++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) (limited to 'kernel/trace/trace.c') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b891de96000f..67be0376d8e3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -154,6 +154,8 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..acf6b68dc4a8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + local_t entries_bytes; local_t commit_overrun; local_t overrun; local_t entries; local_t committing; local_t commits; unsigned long read; + unsigned long read_bytes; u64 write_stamp; u64 read_stamp; }; @@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); /* * The entries will be zeroed out when we move the @@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!tail) tail_page->page->time_stamp = ts; + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + return event; } @@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); return 1; + } } /* could not discard */ @@ -2660,6 +2673,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } +/** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + ret = bpage->page->time_stamp; + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 01176788c387..b41907000bc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4056,6 +4056,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_array *tr = &global_trace; struct trace_seq *s; unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4072,6 +4074,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); + cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3 From 5389f6fad27019f2ba78f1b332f719ec05f12a42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 17:13:33 +0200 Subject: locking, tracing: Annotate tracing locks as raw The tracing locks can be taken in atomic context and therefore cannot be preempted on -rt - annotate it. In mainline this change documents the low level nature of the lock - otherwise there's no functional difference. Lockdep and Sparse checking will work as usual. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 52 ++++++++++++++++++++++---------------------- kernel/trace/trace.c | 10 ++++----- kernel/trace/trace_irqsoff.c | 6 ++--- 3 files changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f2f821acc597 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -1062,7 +1062,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1259,7 +1259,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1277,7 +1277,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1288,7 +1288,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1303,7 +1303,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -2804,9 +2804,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3265,12 +3265,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3295,9 +3295,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3337,7 +3337,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3346,7 +3346,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, } if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3438,11 +3438,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3477,7 +3477,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3488,7 +3488,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3557,7 +3557,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3569,7 +3569,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3607,10 +3607,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3641,10 +3641,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3841,7 +3841,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3964,7 +3964,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..0c8bdeeb358b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); static void wakeup_work_handler(struct work_struct *work) { @@ -960,7 +960,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -985,7 +985,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -1000,7 +1000,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1018,7 +1018,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..11186212068c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr, max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; -- cgit v1.2.3 From 6249687f76b69cc0b2ad34636f4a18d693ef3262 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Sep 2011 11:35:58 -0400 Subject: tracing: Add a counter clock for those that do not trust clocks When debugging tight race conditions, it can be helpful to have a synchronized tracing method. Although in most cases the global clock provides this functionality, if timings is not the issue, it is more comforting to know that the order of events really happened in a precise order. Instead of using a clock, add a "counter" that is simply an incrementing atomic 64bit counter that orders the events as they are perceived to happen. The trace_clock_counter() is added from the attempt by Peter Zijlstra trying to convert the trace_clock_global() to it. I took Peter's counter code and made trace_clock_counter() instead, and added it to the choice of clocks. Just echo counter > /debug/tracing/trace_clock to activate it. Requested-by: Thomas Gleixner Requested-by: Peter Zijlstra Reviewed-By: Valdis Kletnieks Signed-off-by: Steven Rostedt --- include/linux/trace_clock.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_clock.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel/trace/trace.c') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index 7a8130384087..4eb490237d4c 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -15,5 +15,6 @@ extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); extern u64 notrace trace_clock_global(void); +extern u64 notrace trace_clock_counter(void); #endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b41907000bc6..4b8df0dc9358 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -435,6 +435,7 @@ static struct { } trace_clocks[] = { { trace_clock_local, "local" }, { trace_clock_global, "global" }, + { trace_clock_counter, "counter" }, }; int trace_clock_id; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) return now; } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} -- cgit v1.2.3 From e0a413f619ef8bc366dafc6f8221674993b8d85f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Sep 2011 21:26:16 -0400 Subject: tracing: Warn on output if the function tracer was found corrupted As the function tracer is very intrusive, lots of self checks are performed on the tracer and if something is found to be strange it will shut itself down keeping it from corrupting the rest of the kernel. This shutdown may still allow functions to be traced, as the tracing only stops new modifications from happening. Trying to stop the function tracer itself can cause more harm as it requires code modification. Although a WARN_ON() is executed, a user may not notice it. To help the user see that something isn't right with the tracing of the system a big warning is added to the output of the tracer that lets the user know that their data may be incomplete. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++++ kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 2 ++ 3 files changed, 25 insertions(+) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..077d85387908 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3862,6 +3862,14 @@ void ftrace_kill(void) clear_ftrace_function(); } +/** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4b8df0dc9358..13f2b8472fed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2160,6 +2160,14 @@ void trace_default_header(struct seq_file *m) } } +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2169,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v) if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); + test_ftrace_alive(m); } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); @@ -4613,6 +4622,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) tracing_off(); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + if (disable_tracing) ftrace_kill(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4c7540ad5dcb..092e1f8d18dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +extern int ftrace_is_dead(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } +static inline int ftrace_is_dead(void) { return 0; } #endif /* -- cgit v1.2.3 From d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Sep 2011 11:50:27 -0400 Subject: tracing: Do not allocate buffer for trace_marker When doing intense tracing, the kmalloc inside trace_marker can introduce side effects to what is being traced. As trace_marker() is used by userspace to inject data into the kernel ring buffer, it needs to do so with the least amount of intrusion to the operations of the kernel or the user space application. As the ring buffer is designed to write directly into the buffer without the need to make a temporary buffer, and userspace already went through the hassle of knowing how big the write will be, we can simply pin the userspace pages and write the data directly into the buffer. This improves the impact of tracing via trace_marker tremendously! Thanks to Peter Zijlstra and Thomas Gleixner for pointing out the use of get_user_pages_fast() and kmap_atomic(). Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 111 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 28 deletions(-) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 13f2b8472fed..f86efe90ca45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3628,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - char *buf; - size_t written; + unsigned long addr = (unsigned long)ubuf; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + int nr_pages = 1; + ssize_t written; + void *page1; + void *page2; + int offset; + int size; + int len; + int ret; if (tracing_disabled) return -EINVAL; @@ -3651,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 2, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. + */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - if (copy_from_user(buf, ubuf, cnt)) { - kfree(buf); - return -EFAULT; + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; } - if (buf[cnt-1] != '\n') { - buf[cnt] = '\n'; - buf[cnt+1] = '\0'; + + page1 = kmap_atomic(pages[0]); + if (nr_pages == 2) + page2 = kmap_atomic(pages[1]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = global_trace.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; + } + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, page1 + offset, len); + memcpy(&entry->buf[len], page2, cnt - len); } else - buf[cnt] = '\0'; + memcpy(&entry->buf, page1 + offset, cnt); - written = mark_printk("%s", buf); - kfree(buf); - *fpos += written; + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + ring_buffer_unlock_commit(buffer, event); - /* don't tell userspace we wrote more - it might confuse them */ - if (written > cnt) - written = cnt; + written = cnt; + *fpos += written; + + out_unlock: + if (nr_pages == 2) + kunmap_atomic(page2); + kunmap_atomic(page1); + while (nr_pages > 0) + put_page(pages[--nr_pages]); + out: return written; } -- cgit v1.2.3 From 9b5f8b31af57a8ce9e9f77864d9143b5e3304815 Mon Sep 17 00:00:00 2001 From: Geunsik Lim Date: Fri, 12 Aug 2011 14:30:22 +0900 Subject: ftrace: Fix README to state tracing_on to start/stop tracing tracing_enabled option is deprecated. To start/stop tracing, write to /sys/kernel/debug/tracing/tracing_on without tracing_enabled. This patch is based on Linux 3.1.0-rc1 Signed-off-by: Geunsik Lim Link: http://lkml.kernel.org/r/1313127022-23830-1-git-send-email-leemgs1@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f86efe90ca45..cea16053f553 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2720,9 +2720,9 @@ static const char readme_msg[] = "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" ; static ssize_t -- cgit v1.2.3 From 436fc280261dcfce5af38f08b89287750dc91cd2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Oct 2011 10:44:25 -0400 Subject: tracing: Fix returning of duplicate data after EOF in trace_pipe_raw The trace_pipe_raw handler holds a cached page from the time the file is opened to the time it is closed. The cached page is used to handle the case of the user space buffer being smaller than what was read from the ring buffer. The left over buffer is held in the cache so that the next read will continue where the data left off. After EOF is returned (no more data in the buffer), the index of the cached page is set to zero. If a user app reads the page again after EOF, the check in the buffer will see that the cached page is less than page size and will return the cached page again. This will cause reading the trace_pipe_raw again after EOF to return duplicate data, making the output look like the time went backwards but instead data is just repeated. The fix is to not reset the index right after all data is read from the cache, but to reset it after all data is read and more data exists in the ring buffer. Cc: stable Reported-by: Jeremy Eder Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace/trace.c') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cea16053f553..b24a72d35008 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3903,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; - info->read = 0; - trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, @@ -3914,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) -- cgit v1.2.3