From 0f67f04ffcb592d065a20862a82d4539e0f8e909 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 11:56:20 -0500 Subject: tracing: Only create tracer options files if directory exists Do not bother creating tracer options if no tracing directory exists. If a tracer is enabled via the command line, and is started before the tracing directory is created, then it wont have its tracer specific options created. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38c613ede10d..d4627f15407a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4172,8 +4172,11 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* Currently, only the top instance has options */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* + * Only enable if the directory has been created already. + * Currently, only the top instance has options + */ + if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { destroy_trace_option_files(topts); topts = create_trace_option_files(tr, t); } -- cgit v1.2.3 From 09d23a1d8a82e814bd56a4f121b80ea8214ac49d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 12:45:53 -0500 Subject: tracing: Create cmdline tracer options on tracing fs init The options for cmdline tracers are not created if the debugfs system is not ready yet. If tracing has started before debugfs is up, then the option files for the tracer are not created. Create them when creating the tracing directory if the current tracer requires option files. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d4627f15407a..05e0e50539fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4105,9 +4105,24 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static int tracing_set_tracer(struct trace_array *tr, const char *buf) +static void update_tracer_options(struct trace_array *tr, struct tracer *t) { static struct trace_option_dentry *topts; + + /* Only enable if the directory has been created already. */ + if (!tr->dir) + return; + + /* Currently, only the top instance has options */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return; + + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -4172,14 +4187,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* - * Only enable if the directory has been created already. - * Currently, only the top instance has options - */ - if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); - } + update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -6578,6 +6586,10 @@ static __init int tracer_init_debugfs(void) create_trace_options_dir(&global_trace); + /* If the tracer was started via cmdline, create options for it here */ + if (global_trace.current_trace != &nop_trace) + update_tracer_options(&global_trace, global_trace.current_trace); + return 0; } -- cgit v1.2.3 From 8434dc9340cd2e117fc944cf7526263bf490a52a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 12:13:40 -0500 Subject: tracing: Convert the tracing facility over to use tracefs debugfs was fine for the tracing facility as a quick way to get an interface. Now that tracing has matured, it should separate itself from debugfs such that it can be mounted separately without needing to mount all of debugfs with it. That is, users resist using tracing because it requires mounting debugfs. Having tracing have its own file system lets users get the features of tracing without needing to bring in the rest of the kernel's debug infrastructure. Another reason for tracefs is that debubfs does not support mkdir. Currently, to create instances, one does a mkdir in the tracing/instance directory. This is implemented via a hack that forces debugfs to do something it is not intended on doing. By converting over to tracefs, this hack can be removed and mkdir can be properly implemented. This patch does not address this yet, but it lays the ground work for that to be done. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 +++++++-------- kernel/trace/trace.c | 55 +++++++++++++++++++++--------------- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 32 ++++++++++----------- kernel/trace/trace_functions_graph.c | 7 ++--- kernel/trace/trace_kprobe.c | 10 +++---- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_stat.c | 10 +++---- 8 files changed, 74 insertions(+), 66 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45e5cb143d17..fcc0e7052a79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1008,7 +1008,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -1044,15 +1044,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } } - entry = debugfs_create_file("function_profile_enabled", 0644, + entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ @@ -4690,7 +4690,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); } -static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { trace_create_file("available_filter_functions", 0444, @@ -4998,7 +4998,7 @@ static int __init ftrace_nodyn_init(void) } core_initcall(ftrace_nodyn_init); -static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ @@ -5451,7 +5451,7 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_tracefs(void) { struct dentry *d_tracer; @@ -5459,16 +5459,16 @@ static __init int ftrace_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - ftrace_init_dyn_debugfs(d_tracer); + ftrace_init_dyn_tracefs(d_tracer); trace_create_file("set_ftrace_pid", 0644, d_tracer, NULL, &ftrace_pid_fops); - ftrace_profile_debugfs(d_tracer); + ftrace_profile_tracefs(d_tracer); return 0; } -fs_initcall(ftrace_init_debugfs); +fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 05e0e50539fc..6c4739bee4bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -5828,6 +5829,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + + /* Top directory uses NULL as the parent */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return NULL; + + /* All sub buffers have a descriptor */ return tr->dir; } @@ -5842,10 +5851,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (IS_ERR(d_tracer)) return NULL; - tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); WARN_ONCE(!tr->percpu_dir, - "Could not create debugfs directory 'per_cpu/%d'\n", cpu); + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } @@ -5862,7 +5871,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, } static void -tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; @@ -5872,9 +5881,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6026,9 +6035,9 @@ struct dentry *trace_create_file(const char *name, { struct dentry *ret; - ret = debugfs_create_file(name, mode, parent, data, fops); + ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); + pr_warning("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6045,9 +6054,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (IS_ERR(d_tracer)) return NULL; - tr->options = debugfs_create_dir("options", d_tracer); + tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create debugfs directory 'options'\n"); + pr_warning("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6116,7 +6125,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) return; for (cnt = 0; topts[cnt].opt; cnt++) - debugfs_remove(topts[cnt].entry); + tracefs_remove(topts[cnt].entry); kfree(topts); } @@ -6205,7 +6214,7 @@ static const struct file_operations rb_simple_fops = { struct dentry *trace_instance_dir; static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) @@ -6321,17 +6330,17 @@ static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = debugfs_create_dir(name, trace_instance_dir); + tr->dir = tracefs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); goto out_free_tr; } - init_tracer_debugfs(tr, tr->dir); + init_tracer_tracefs(tr, tr->dir); list_add(&tr->list, &ftrace_trace_arrays); @@ -6404,7 +6413,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m return -ENOENT; /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. If two users try to make the same dir at @@ -6434,7 +6443,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. If two users try to make the same dir at @@ -6459,7 +6468,7 @@ static const struct inode_operations instance_dir_inode_operations = { static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = debugfs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_dir("instances", d_tracer); if (WARN_ON(!trace_instance_dir)) return; @@ -6468,7 +6477,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) } static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; @@ -6522,7 +6531,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) #endif for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(tr, cpu); + tracing_init_tracefs_percpu(tr, cpu); } @@ -6550,10 +6559,10 @@ struct dentry *tracing_init_dentry(void) return ERR_PTR(-ENOMEM); } - return tr->dir; + return NULL; } -static __init int tracer_init_debugfs(void) +static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; @@ -6563,7 +6572,7 @@ static __init int tracer_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - init_tracer_debugfs(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); @@ -6925,5 +6934,5 @@ __init static int clear_boot_tracer(void) return 0; } -fs_initcall(tracer_init_debugfs); +fs_initcall(tracer_init_tracefs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d951deddec89 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,7 +334,7 @@ struct tracer_flags { /** - * struct tracer - a specific tracer and its callbacks to interact with debugfs + * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..0d2e47370ee7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) return; if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); + tracefs_remove_recursive(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) } spin_unlock(&dir->d_lock); - debugfs_remove_recursive(dir); + tracefs_remove_recursive(dir); } list_del(&file->list); @@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = debugfs_create_dir(name, parent); + dir->entry = tracefs_create_dir(name, parent); if (!dir->entry) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); @@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->subsystem = system; file->system = dir; - entry = debugfs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warn("Could not create debugfs '%s/filter' entry\n", name); + pr_warn("Could not create tracefs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) d_events = parent; name = ftrace_event_name(call); - file->dir = debugfs_create_dir(name, d_events); + file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { - pr_warn("Could not create debugfs '%s' directory\n", name); + pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } @@ -2228,7 +2228,7 @@ static inline int register_event_cmds(void) { return 0; } /* * The top level array has already had its ftrace_event_file * descriptors created in order to allow for early events to - * be recorded. This function is called after the debugfs has been + * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated * to the events. */ @@ -2311,16 +2311,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = debugfs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warn("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create tracefs 'set_event' entry\n"); return -ENOMEM; } - d_events = debugfs_create_dir("events", parent); + d_events = tracefs_create_dir("events", parent); if (!d_events) { - pr_warn("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } @@ -2412,7 +2412,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - debugfs_remove_recursive(tr->event_dir); + tracefs_remove_recursive(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; @@ -2534,10 +2534,10 @@ static __init int event_trace_init(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warn("Could not create debugfs 'available_events' entry\n"); + pr_warn("Could not create tracefs 'available_events' entry\n"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -6,7 +6,6 @@ * is Copyright (c) Steven Rostedt * */ -#include #include #include #include @@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, * The curr_ret_stack is initialized to -1 and get increased * in this function. So it can be less than -1 only if it was * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in debugfs by user. + * set from set_graph_notrace file in tracefs by user. */ if (current->curr_ret_stack < -1) return -EBUSY; @@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { .llseek = generic_file_llseek, }; -static __init int init_graph_debugfs(void) +static __init int init_graph_tracefs(void) { struct dentry *d_tracer; @@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) return 0; } -fs_initcall(init_graph_debugfs); +fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b4a00def88f5..c1c6655847c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1310,7 +1310,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } -/* Make a debugfs interface for controlling probe points */ +/* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; @@ -1323,20 +1323,20 @@ static __init int init_kprobe_trace(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, d_tracer, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_events' entry\n"); /* Profile interface */ - entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_profile' entry\n"); return 0; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..19aff635841a 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "trace_stat.h" #include "trace.h" @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) static void destroy_session(struct stat_session *session) { - debugfs_remove(session->file); + tracefs_remove(session->file); __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); @@ -279,9 +279,9 @@ static int tracing_stat_init(void) if (IS_ERR(d_tracing)) return 0; - stat_dir = debugfs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'trace_stat' entry\n"); return 0; } @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && tracing_stat_init()) return -ENODEV; - session->file = debugfs_create_file(session->ts->name, 0644, + session->file = tracefs_create_file(session->ts->name, 0644, stat_dir, session, &tracing_stat_fops); if (!session->file) -- cgit v1.2.3 From f76180bc07abc399977bfbe8c43bf58c4570e893 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 15:48:46 -0500 Subject: tracing: Automatically mount tracefs on debugfs/tracing As tools currently rely on the tracing directory in debugfs, we can not just created a tracefs infrastructure and expect sysadmins to mount the new tracefs to have their old tools work. Instead, the debugfs tracing directory is still created and the tracefs file system is mounted there when the debugfs filesystem is mounted. No longer does the tracing infrastructure update the debugfs file system, but instead interacts with the tracefs file system. But now, it still appears to the user like nothing changed, except you also have the feature of mounting just the tracing system without needing all of debugfs! Cc: Al Viro Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c4739bee4bb..b4aa936509d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -6535,6 +6536,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) } +static struct vfsmount *trace_automount(void *ingore) +{ + struct vfsmount *mnt; + struct file_system_type *type; + + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; + mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + return NULL; + mntget(mnt); + + return mnt; +} + /** * tracing_init_dentry - initialize top level trace array * @@ -6546,14 +6569,21 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + /* The top level trace array uses NULL as parent */ if (tr->dir) - return tr->dir; + return NULL; if (WARN_ON(!debugfs_initialized())) return ERR_PTR(-ENODEV); - tr->dir = debugfs_create_dir("tracing", NULL); - + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kerenl. + */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); if (!tr->dir) { pr_warn_once("Could not create debugfs directory 'tracing'\n"); return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From eae473581cf93dad94ca833aa961c033c6a43924 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 21 Jan 2015 10:01:39 -0500 Subject: tracing: Have mkdir and rmdir be part of tracefs The tracing "instances" directory can create sub tracing buffers with mkdir, and remove them with rmdir. As a mkdir will also create all the files and directories that control the sub buffer the inode mutexes need to be released before this is done, to avoid deadlocks. It is better to let the tracing system unlock the inode mutexes before calling the functions that create the files within the new directory (or deletes the files from the one being destroyed). Now that tracing has been converted over to tracefs, the tracefs file system can be modified to accommodate this feature. It still releases the locks, but the filesystem itself can take care of the ugly business and let the user just do what it needs. The tracing system now attaches a descriptor to the directory dentry that can have userspace create or remove sub directories. If this descriptor does not exist for a dentry, then that dentry can not be used to create other directories. This descriptor holds a mkdir and rmdir method that only takes a character string as an argument. The tracefs file system will first make a copy of the dentry name before releasing the locks. Then it will pass the copied name to the methods. It is up to the tracing system that supplied the methods to handle races with duplicate names and such as all the inode mutexes would be released when the functions are called. Cc: Al Viro Signed-off-by: Steven Rostedt --- fs/tracefs/inode.c | 151 +++++++++++++++++++++++++++++++++++++++++++----- include/linux/tracefs.h | 4 ++ kernel/trace/trace.c | 75 ++---------------------- 3 files changed, 145 insertions(+), 85 deletions(-) (limited to 'kernel/trace') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0b9cf5cf24c9..d92bdf3b079a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -50,6 +50,84 @@ static const struct file_operations tracefs_file_operations = { .llseek = noop_llseek, }; +static struct tracefs_dir_ops { + int (*mkdir)(const char *name); + int (*rmdir)(const char *name); +} tracefs_ops; + +static char *get_dname(struct dentry *dentry) +{ + const char *dname; + char *name; + int len = dentry->d_name.len; + + dname = dentry->d_name.name; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return NULL; + memcpy(name, dname, len); + name[len] = 0; + return name; +} + +static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The mkdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * mkdir routine to handle races. + */ + mutex_unlock(&inode->i_mutex); + ret = tracefs_ops.mkdir(name); + mutex_lock(&inode->i_mutex); + + kfree(name); + + return ret; +} + +static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The rmdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * rmdir routine to handle races. + * This time we need to unlock not only the parent (inode) but + * also the directory that is being deleted. + */ + mutex_unlock(&inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + + ret = tracefs_ops.rmdir(name); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + kfree(name); + + return ret; +} + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = tracefs_syscall_mkdir, + .rmdir = tracefs_syscall_rmdir, +}; + static struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -334,6 +412,31 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, return end_creating(dentry); } +static struct dentry *__create_dir(const char *name, struct dentry *parent, + const struct inode_operations *ops) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = ops; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); +} + /** * tracefs_create_dir - create a directory in the tracefs filesystem * @name: a pointer to a string containing the name of the directory to @@ -353,26 +456,44 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { - struct dentry *dentry = start_creating(name, parent); - struct inode *inode; + return __create_dir(name, parent, &simple_dir_inode_operations); +} - if (IS_ERR(dentry)) +/** + * tracefs_create_instance_dir - create the tracing instances directory + * @name: The name of the instances directory to create + * @parent: The parent directory that the instances directory will exist + * @mkdir: The function to call when a mkdir is performed. + * @rmdir: The function to call when a rmdir is performed. + * + * Only one instances directory is allowed. + * + * The instances directory is special as it allows for mkdir and rmdir to + * to be done by userspace. When a mkdir or rmdir is performed, the inode + * locks are released and the methhods passed in (@mkdir and @rmdir) are + * called without locks and with the name of the directory being created + * within the instances directory. + * + * Returns the dentry of the instances directory. + */ +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)) +{ + struct dentry *dentry; + + /* Only allow one instance of the instances directory. */ + if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - inode = tracefs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); + dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + if (!dentry) + return NULL; - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + tracefs_ops.mkdir = mkdir; + tracefs_ops.rmdir = rmdir; - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return end_creating(dentry); + return dentry; } static inline int tracefs_positive(struct dentry *dentry) diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 23e04ce21749..5b727a17beee 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -34,6 +34,10 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent); void tracefs_remove(struct dentry *dentry); void tracefs_remove_recursive(struct dentry *dentry); +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)); + bool tracefs_initialized(void); #endif /* CONFIG_TRACING */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4aa936509d2..3c8913bac204 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6292,7 +6292,7 @@ static void free_trace_buffers(struct trace_array *tr) #endif } -static int new_instance_create(const char *name) +static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; @@ -6362,7 +6362,7 @@ static int new_instance_create(const char *name) } -static int instance_delete(const char *name) +static int instance_rmdir(const char *name) { struct trace_array *tr; int found = 0; @@ -6403,78 +6403,13 @@ static int instance_delete(const char *name) return ret; } -static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the new_instance_create() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); - - ret = new_instance_create(dentry->d_iname); - - mutex_lock(&inode->i_mutex); - - return ret; -} - -static int instance_rmdir(struct inode *inode, struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* The caller did a dget() on dentry */ - mutex_unlock(&dentry->d_inode->i_mutex); - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the instance_delete() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); - - ret = instance_delete(dentry->d_iname); - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); - - return ret; -} - -static const struct inode_operations instance_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = instance_mkdir, - .rmdir = instance_rmdir, -}; - static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = tracefs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); if (WARN_ON(!trace_instance_dir)) return; - - /* Hijack the dir inode operations, to allow mkdir */ - trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; } static void -- cgit v1.2.3 From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2015 22:10:19 +0100 Subject: perf: Remove type specific target pointers The only reason CQM had to use a hard-coded pmu type was so it could use cqm_target in hw_perf_event. Do away with the {tp,bp,cqm}_target pointers and provide a non type specific one. This allows us to do away with that silly pmu type as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: acme@kernel.org Cc: acme@redhat.com Cc: hpa@zytor.com Cc: jolsa@redhat.com Cc: kanaka.d.juvva@intel.com Cc: matt.fleming@intel.com Cc: tglx@linutronix.de Cc: torvalds@linux-foundation.org Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm64/kernel/hw_breakpoint.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++---- include/linux/perf_event.h | 4 +--- include/uapi/linux/perf_event.h | 1 - kernel/events/core.c | 14 ++++---------- kernel/events/hw_breakpoint.c | 8 ++++---- kernel/trace/trace_uprobe.c | 10 +++++----- 8 files changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7fc70ae21185..dc7d0a95bd36 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Per-cpu breakpoints are not supported by our stepping * mechanism. */ - if (!bp->hw.bp_target) + if (!bp->hw.target) return -EINVAL; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index df1cf15377b4..d062f35911c2 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Disallow per-task kernel breakpoints since these would * complicate the stepping code. */ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target) + if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 9a8ef8376fcd..e4d1b8b738fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. */ - if (a->hw.cqm_target == b->hw.cqm_target) + if (a->hw.target == b->hw.target) return true; /* @@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.cqm_target); + return perf_cgroup_from_task(event->hw.target); return event->cgrp; } @@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", - PERF_TYPE_INTEL_CQM); + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dac4c2831d82..5aa49d7bfd07 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -119,7 +119,6 @@ struct hw_perf_event { struct hrtimer hrtimer; }; struct { /* tracepoint */ - struct task_struct *tp_target; /* for tp_event->class */ struct list_head tp_list; }; @@ -129,7 +128,6 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; - struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ @@ -138,12 +136,12 @@ struct hw_perf_event { * problem hw_breakpoint has with context * creation and event initalization. */ - struct task_struct *bp_target; struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif }; + struct task_struct *target; int state; local64_t prev_count; u64 sample_period; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3c8b45de57ec..1e3cd07cf76e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,7 +32,6 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, - PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 71109a045450..525062b6fba1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7171,18 +7171,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. */ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - else if (attr->type == PERF_TYPE_INTEL_CQM) - event->hw.cqm_target = task; + event->hw.target = task; } if (!overflow_handler && parent_event) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b11441321e7a..93fdc7791eaa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely -- cgit v1.2.3 From 80a9b64e2c156b6523e7a01f2ba6e5d86e722814 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2015 10:40:38 -0400 Subject: ring-buffer: Replace this_cpu_*() with __this_cpu_*() It has come to my attention that this_cpu_read/write are horrible on architectures other than x86. Worse yet, they actually disable preemption or interrupts! This caused some unexpected tracing results on ARM. 101.356868: preempt_count_add <-ring_buffer_lock_reserve 101.356870: preempt_count_sub <-ring_buffer_lock_reserve The ring_buffer_lock_reserve has recursion protection that requires accessing a per cpu variable. But since preempt_disable() is traced, it too got traced while accessing the variable that is suppose to prevent recursion like this. The generic version of this_cpu_read() and write() are: #define this_cpu_generic_read(pcp) \ ({ typeof(pcp) ret__; \ preempt_disable(); \ ret__ = *this_cpu_ptr(&(pcp)); \ preempt_enable(); \ ret__; \ }) #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long flags; \ raw_local_irq_save(flags); \ *__this_cpu_ptr(&(pcp)) op val; \ raw_local_irq_restore(flags); \ } while (0) Which is unacceptable for locations that know they are within preempt disabled or interrupt disabled locations. Paul McKenney stated that __this_cpu_() versions produce much better code on other architectures than this_cpu_() does, if we know that the call is done in a preempt disabled location. I also changed the recursive_unlock() to use two local variables instead of accessing the per_cpu variable twice. Link: http://lkml.kernel.org/r/20150317114411.GE3589@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20150317104038.312e73d1@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Christoph Lameter Reported-by: Uwe Kleine-Koenig Tested-by: Uwe Kleine-Koenig Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..922048a0f7ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); static __always_inline int trace_recursive_lock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); int bit; if (in_interrupt()) { @@ -2696,18 +2696,17 @@ static __always_inline int trace_recursive_lock(void) return 1; val |= (1 << bit); - this_cpu_write(current_context, val); + __this_cpu_write(current_context, val); return 0; } static __always_inline void trace_recursive_unlock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); - val--; - val &= this_cpu_read(current_context); - this_cpu_write(current_context, val); + val &= val & (val - 1); + __this_cpu_write(current_context, val); } #else -- cgit v1.2.3 From bbedb179944c29e5e449603163eec9951116fe39 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 11 Mar 2015 22:13:57 -0500 Subject: tracing: %pF is only for function pointers Use %pS for actual addresses, otherwise you'll get bad output on arches like ppc64 where %pF expects a function descriptor. Link: http://lkml.kernel.org/r/1426130037-17956-22-git-send-email-scottwood@freescale.com Signed-off-by: Scott Wood Signed-off-by: Steven Rostedt --- include/trace/events/btrfs.h | 4 ++-- include/trace/events/ext3.h | 2 +- include/trace/events/ext4.h | 6 +++--- include/trace/events/module.h | 4 ++-- include/trace/events/random.h | 10 +++++----- kernel/trace/trace_entries.h | 6 +++--- tools/lib/traceevent/event-parse.c | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 1faecea101f3..572e6503394a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -962,7 +962,7 @@ TRACE_EVENT(alloc_extent_state, __entry->ip = IP ), - TP_printk("state=%p; mask = %s; caller = %pF", __entry->state, + TP_printk("state=%p; mask = %s; caller = %pS", __entry->state, show_gfp_flags(__entry->mask), (void *)__entry->ip) ); @@ -982,7 +982,7 @@ TRACE_EVENT(free_extent_state, __entry->ip = IP ), - TP_printk(" state=%p; caller = %pF", __entry->state, + TP_printk(" state=%p; caller = %pS", __entry->state, (void *)__entry->ip) ); diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h index 6797b9de90ed..7f20707849bb 100644 --- a/include/trace/events/ext3.h +++ b/include/trace/events/ext3.h @@ -144,7 +144,7 @@ TRACE_EVENT(ext3_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6e5abd6d38a2..47fca36ee426 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -240,7 +240,7 @@ TRACE_EVENT(ext4_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); @@ -1762,7 +1762,7 @@ TRACE_EVENT(ext4_journal_start, __entry->rsv_blocks = rsv_blocks; ), - TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip) ); @@ -1784,7 +1784,7 @@ TRACE_EVENT(ext4_journal_start_reserved, __entry->blocks = blocks; ), - TP_printk("dev %d,%d blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 81c4c183d348..28c45997e451 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -84,7 +84,7 @@ DECLARE_EVENT_CLASS(module_refcnt, __assign_str(name, mod->name); ), - TP_printk("%s call_site=%pf refcnt=%d", + TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); @@ -121,7 +121,7 @@ TRACE_EVENT(module_request, __assign_str(name, name); ), - TP_printk("%s wait=%d call_site=%pf", + TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); diff --git a/include/trace/events/random.h b/include/trace/events/random.h index 805af6db41cc..4684de344c5d 100644 --- a/include/trace/events/random.h +++ b/include/trace/events/random.h @@ -22,7 +22,7 @@ TRACE_EVENT(add_device_randomness, __entry->IP = IP; ), - TP_printk("bytes %d caller %pF", + TP_printk("bytes %d caller %pS", __entry->bytes, (void *)__entry->IP) ); @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(random__mix_pool_bytes, __entry->IP = IP; ), - TP_printk("%s pool: bytes %d caller %pF", + TP_printk("%s pool: bytes %d caller %pS", __entry->pool_name, __entry->bytes, (void *)__entry->IP) ); @@ -82,7 +82,7 @@ TRACE_EVENT(credit_entropy_bits, ), TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " - "caller %pF", __entry->pool_name, __entry->bits, + "caller %pS", __entry->pool_name, __entry->bits, __entry->entropy_count, __entry->entropy_total, (void *)__entry->IP) ); @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(random__get_random_bytes, __entry->IP = IP; ), - TP_printk("nbytes %d caller %pF", __entry->nbytes, (void *)__entry->IP) + TP_printk("nbytes %d caller %pS", __entry->nbytes, (void *)__entry->IP) ); DEFINE_EVENT(random__get_random_bytes, get_random_bytes, @@ -242,7 +242,7 @@ DECLARE_EVENT_CLASS(random__extract_entropy, __entry->IP = IP; ), - TP_printk("%s pool: nbytes %d entropy_count %d caller %pF", + TP_printk("%s pool: nbytes %d entropy_count %d caller %pS", __entry->pool_name, __entry->nbytes, __entry->entropy_count, (void *)__entry->IP) ); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->fmt), FILTER_OTHER @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), FILTER_OTHER @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, __field( const char *, str ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->str), FILTER_OTHER diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index afe20ed9fac8..2c0bd8f2aad0 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3976,7 +3976,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc if (asprintf(&arg->atom.atom, "%lld", ip) < 0) goto out_free; - /* skip the first "%pf: " */ + /* skip the first "%ps: " */ for (ptr = fmt + 5, bptr = data + field->offset; bptr < data + size && *ptr; ptr++) { int ls = 0; -- cgit v1.2.3 From 754cb0071a5c9576ccfa6523969ef6a2f6a71676 Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 3 Mar 2015 15:21:33 +0800 Subject: tracing: remove ftrace:function TRACE_EVENT_FL_USE_CALL_FILTER flag TRACE_EVENT_FL_USE_CALL_FILTER flag in ftrace:functon event can be removed. This flag was first introduced in commit f306cc82a93d ("tracing: Update event filters for multibuffer"). Now, the only place uses this flag is ftrace:function, but the filter of ftrace:function has a different code path with events/syscalls and events/tracepoints. It uses ftrace_filter_write() and perf's ftrace_profile_set_filter() to set the filter, the functionality of file 'tracing/events/ftrace/function/filter' is bypassed in function init_pred(), in which case, neither call->filter nor file->filter is used. So we can safely remove TRACE_EVENT_FL_USE_CALL_FILTER flag from ftrace:function events. Link: http://lkml.kernel.org/r/1425367294-27852-1-git-send-email-hekuang@huawei.com Signed-off-by: He Kuang Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ }, \ .event.type = etype, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; -- cgit v1.2.3 From d9a16d3ab8770357015c85a07387f1d2676a4773 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 12 Mar 2015 16:58:34 +1100 Subject: trace: Don't use __weak in header files The commit that added a check for this to checkpatch says: "Using weak declarations can have unintended link defects. The __weak on the declaration causes non-weak definitions to become weak." In this case, when a PowerPC kernel is built with CONFIG_KPROBE_EVENT but not CONFIG_UPROBE_EVENT, it generates the following warning: WARNING: 1 bad relocations c0000000014f2190 R_PPC64_ADDR64 uprobes_fetch_type_table This is fixed by passing the fetch_table arrays to traceprobe_parse_probe_arg() which also means that they can never be NULL. Link: http://lkml.kernel.org/r/20150312165834.4482cb48@canb.auug.org.au Acked-by: Masami Hiramatsu Signed-off-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_probe.c | 19 +++++++------------ kernel/trace/trace_probe.h | 10 ++-------- kernel/trace/trace_uprobe.c | 5 +++-- 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..f34c3ad1b5f4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) #define fetch_file_offset_string_size NULL /* Fetch type information table */ -const struct fetch_type kprobes_fetch_type_table[] = { +static const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true); + is_return, true, + kprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe) + struct fetch_param *f, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; int ret = 0; - ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - switch (arg[0]) { case '$': ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); @@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, dprm->fetch_size = get_fetch_size_function(t, dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe); + is_kprobe, ftbl); if (ret) kfree(dprm); else { @@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe) + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; const char *t; int ret; - ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, } parg->offset = *size; *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, + is_kprobe, ftbl); if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..e30f6cce4af6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -/* - * Fetch type information table. - * It's declared as a weak symbol due to conditional compilation. - */ -extern __weak const struct fetch_type kprobes_fetch_type_table[]; -extern __weak const struct fetch_type uprobes_fetch_type_table[]; - #ifdef CONFIG_KPROBE_EVENT struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) } extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe); + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..74865465e0b7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) DEFINE_FETCH_file_offset(string_size) /* Fetch type information table */ -const struct fetch_type uprobes_fetch_type_table[] = { +static const struct fetch_type uprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false); + is_return, false, + uprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; -- cgit v1.2.3 From d631c8cceb1d1d06f372878935949d421585186b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 27 Mar 2015 17:39:49 -0400 Subject: ring-buffer: Remove duplicate use of '&' in recursive code A clean up of the recursive protection code changed val = this_cpu_read(current_context); val--; val &= this_cpu_read(current_context); to val = this_cpu_read(current_context); val &= val & (val - 1); Which has a duplicate use of '&' as the above is the same as val = val & (val - 1); Actually, it would be best to remove that line altogether and just add it to where it is used. And Christoph even mentioned that it can be further compacted to just a single line: __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); Link: http://lkml.kernel.org/alpine.DEB.2.11.1503271423580.23114@gentwo.org Suggested-by: Christoph Lameter Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 922048a0f7ea..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2703,10 +2703,7 @@ static __always_inline int trace_recursive_lock(void) static __always_inline void trace_recursive_unlock(void) { - unsigned int val = __this_cpu_read(current_context); - - val &= val & (val - 1); - __this_cpu_write(current_context, val); + __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); } #else -- cgit v1.2.3 From 72cbbc8994242b5b43753738c01bf07bf29cb70d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:19 -0700 Subject: tracing: Add kprobe flag add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of tracepoints, since bpf programs can only be attached to kprobe type of PERF_TYPE_TRACEPOINT perf events. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-3-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 3 +++ kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c674ee8f7fca..77325e1a1816 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -252,6 +252,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED_BIT, TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, }; /* @@ -265,6 +266,7 @@ enum { * it is best to clear the buffers that used it). * USE_CALL_FILTER - For ftrace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -274,6 +276,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..8fa549f6f528 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1286,7 +1286,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; call->data = tk; ret = trace_add_event_call(call); -- cgit v1.2.3 From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:20 -0700 Subject: tracing, perf: Implement BPF programs attached to kprobes BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wraper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are a fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 11 ++++ include/uapi/linux/bpf.h | 3 + include/uapi/linux/perf_event.h | 1 + kernel/bpf/syscall.c | 7 ++- kernel/events/core.c | 59 ++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/bpf_trace.c | 130 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 8 +++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/bpf_trace.c (limited to 'kernel/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 77325e1a1816..0aa535bc9f05 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -13,6 +13,7 @@ struct trace_array; struct trace_buffer; struct tracer; struct dentry; +struct bpf_prog; struct trace_print_flags { unsigned long mask; @@ -306,6 +307,7 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; int (*perf_perm)(struct ftrace_event_call *, struct perf_event *); @@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file, event_triggers_post_call(file, tt); } +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..b2948feeb70b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3bb40ddadbe5..91803e54ee73 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -381,6 +381,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..504c10b990ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) diff --git a/kernel/events/core.c b/kernel/events/core.c index c40c2cac2d8e..5c13862d3e85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -3407,6 +3409,7 @@ errout: } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3416,6 +3419,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } @@ -3928,6 +3932,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3981,6 +3986,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -6455,6 +6463,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6470,6 +6521,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..c575a300103b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..f1e87da91da3 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,130 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8fa549f6f528..dc3462507d7c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1134,11 +1134,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; -- cgit v1.2.3 From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:21 -0700 Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns() bpf_ktime_get_ns() is used by programs to compute time delta between events or as a timestamp Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b2948feeb70b..238c6883877b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1e87da91da3..8f5787294971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,6 +78,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -89,6 +101,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_map_delete_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit v1.2.3 From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:22 -0700 Subject: tracing: Allow BPF programs to call bpf_trace_printk() Debugging of BPF programs needs some form of printk from the program, so let programs call limited trace_printk() with %d %u %x %p modifiers only. Similar to kernel modules, during program load verifier checks whether program is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers and emits big 'this is debug only' banner. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 238c6883877b..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f5787294971..2d56ce501632 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "trace.h" static DEFINE_PER_CPU(int, bpf_prog_active); @@ -90,6 +91,74 @@ static const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -103,6 +172,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; default: return NULL; } -- cgit v1.2.3 From e1abf2cc8d5d80b41c4419368ec743ccadbb131e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Apr 2015 15:51:39 +0200 Subject: bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So bpf_tracing.o depends on CONFIG_BPF_SYSCALL - but that's not its only dependency, it also depends on the tracing infrastructure and on kprobes, without which it will fail to build with: In file included from kernel/trace/bpf_trace.c:14:0: kernel/trace/trace.h: In function ‘trace_test_and_set_recursion’: kernel/trace/trace.h:491:28: error: ‘struct task_struct’ has no member named ‘trace_recursion’ unsigned int val = current->trace_recursion; [...] It took quite some time to trigger this build failure, because right now BPF_SYSCALL is very obscure, depends on CONFIG_EXPERT. So also make BPF_SYSCALL more configurable, not just under CONFIG_EXPERT. If BPF_SYSCALL, tracing and kprobes are enabled then enable the bpf_tracing gateway as well. We might want to make this an interactive option later on, although I'd not complicate it unnecessarily: enabling BPF_SYSCALL is enough of an indicator that the user wants BPF support. Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/Makefile | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d4261b..2b4d055aca4a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1513,7 +1513,7 @@ config EVENTFD # syscall, maps, verifier config BPF_SYSCALL - bool "Enable bpf() system call" if EXPERT + bool "Enable bpf() system call" select ANON_INODES select BPF default n diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..c8e53c051293 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -432,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c575a300103b..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) -- cgit v1.2.3 From 00ccbf2f5b7580cd7dcdaeda84828d14f0cba3c9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 19 Feb 2015 15:56:14 +0100 Subject: ftrace/x86: Let dynamic trampolines call ops->func even for dynamic fops Dynamically allocated trampolines call ftrace_ops_get_func to get the function which they should call. For dynamic fops (FTRACE_OPS_FL_DYNAMIC flag is set) ftrace_ops_list_func is always returned. This is reasonable for static trampolines but goes against the main advantage of dynamic ones, that is avoidance of going through the list of all registered callbacks for functions that are only being traced by a single callback. We can fix it by returning ops->func (or recursion safe version) from ftrace_ops_get_func whenever it is possible for dynamic trampolines. Note that dynamic trampolines are not allowed for dynamic fops if CONFIG_PREEMPT=y. Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1501291023000.25445@pobox.suse.cz Link: http://lkml.kernel.org/r/1424357773-13536-1-git-send-email-mbenes@suse.cz Reported-by: Miroslav Benes Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..d01d238d8ef4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,6 +249,19 @@ static void update_function_graph_func(void); static inline void update_function_graph_func(void) { } #endif + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + static void update_ftrace_function(void) { ftrace_func_t func; @@ -270,7 +283,7 @@ static void update_ftrace_function(void) * then have the mcount trampoline call the function directly. */ } else if (ftrace_ops_list->next == &ftrace_list_end) { - func = ftrace_ops_get_func(ftrace_ops_list); + func = ftrace_ops_get_list_func(ftrace_ops_list); } else { /* Just use the default ftrace_ops */ @@ -5208,13 +5221,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { - /* - * If this is a dynamic ops or we force list func, - * then it needs to call the list anyway. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) - return ftrace_ops_list_func; - /* * If the func handles its own recursion, call it directly. * Otherwise call the recursion protected function that -- cgit v1.2.3 From 0c564a538aa934ad15b2145aaf8b64f3feb0be63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 24 Mar 2015 17:58:09 -0400 Subject: tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values Several tracepoints use the helper functions __print_symbolic() or __print_flags() and pass in enums that do the mapping between the binary data stored and the value to print. This works well for reading the ASCII trace files, but when the data is read via userspace tools such as perf and trace-cmd, the conversion of the binary value to a human string format is lost if an enum is used, as userspace does not have access to what the ENUM is. For example, the tracepoint trace_tlb_flush() has: __print_symbolic(REC->reason, { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" }) Which maps the enum values to the strings they represent. But perf and trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would not be able to map it. With TRACE_DEFINE_ENUM(), developers can place these in the event header files and ftrace will convert the enums to their values: By adding: TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH); TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN); $ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format [...] __print_symbolic(REC->reason, { 0, "flush on task switch" }, { 1, "remote shootdown" }, { 2, "local shootdown" }, { 3, "local mm shootdown" }) The above is what userspace expects to see, and tools do not need to be modified to parse them. Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org Cc: Guilherme Cox Cc: Tony Luck Cc: Xie XiuQi Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 5 +- include/linux/ftrace_event.h | 2 +- include/linux/tracepoint.h | 8 +++ include/trace/ftrace.h | 22 ++++++- kernel/trace/trace.c | 26 ++++++++- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 119 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 178 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ac78910d7416..f8e8b34dc427 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -124,7 +124,10 @@ #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ - VMLINUX_SYMBOL(__stop_ftrace_events) = .; + VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ + VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ + *(_ftrace_enum_map) \ + VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; #else #define FTRACE_EVENTS() #endif diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 62b8fac7ded5..112cf49d9576 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -285,7 +285,7 @@ struct ftrace_event_call { struct tracepoint *tp; }; struct trace_event event; - const char *print_fmt; + char *print_fmt; struct event_filter *filter; void *mod; void *data; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c72851328ca9..a5f7f3ecafa3 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -36,6 +36,12 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct trace_enum_map { + const char *system; + const char *enum_string; + unsigned long enum_value; +}; + extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -87,6 +93,8 @@ extern void syscall_unregfunc(void); #define PARAMS(args...) args +#define TRACE_DEFINE_ENUM(x) + #endif /* _LINUX_TRACEPOINT_H */ /* diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 2f9b95b6d3fb..37d4b10b111d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -33,6 +33,19 @@ TRACE_MAKE_SYSTEM_STR(); +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) \ + static struct trace_enum_map __used __initdata \ + __##TRACE_SYSTEM##_##a = \ + { \ + .system = TRACE_SYSTEM_STRING, \ + .enum_string = #a, \ + .enum_value = a \ + }; \ + static struct trace_enum_map __used \ + __attribute__((section("_ftrace_enum_map"))) \ + *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a + /* * DECLARE_EVENT_CLASS can be used to add a generic function * handlers for events. That is, if all events have the same @@ -136,6 +149,9 @@ TRACE_MAKE_SYSTEM_STR(); * The size of an array is also encoded, in the higher 16 bits of . */ +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) + #undef __field #define __field(type, item) @@ -553,7 +569,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * .trace = ftrace_raw_output_, <-- stage 2 * }; * - * static const char print_fmt_[] = ; + * static char print_fmt_[] = ; * * static struct ftrace_event_class __used event_class_