From d24d7dbf3cc49b00a152e55e24f0eeb173c7a971 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 18:16:44 +0800 Subject: tracing: Verify target file before registering a uprobe event Without this patch, we can register a uprobe event for a directory. Enabling such a uprobe event would anyway fail. Example: $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events However, directories cannot be valid targets for uprobe. Hence verify if the target is a regular file during the probe registration. Link: http://lkml.kernel.org/r/20130103004212.690763002@goodmis.org Cc: Namhyung Kim Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju [ cleaned up whitespace and removed redundant IS_DIR() check ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67fb..87b6db4ccbc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -258,6 +258,10 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; inode = igrab(path.dentry->d_inode); + if (!S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } argc -= 2; argv += 2; @@ -356,7 +360,7 @@ fail_address_parse: if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } -- cgit v1.2.3 From fe20d71f25400cccc8bffef865f79250be7dbc81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 17:32:30 +0100 Subject: uprobes: Kill uprobe_consumer->filter() uprobe_consumer->filter() is pointless in its current form, kill it. We will add it back, but with the different signature/semantics. Perhaps we will even re-introduce the callsite in handler_chain(), but not to just skip uc->handler(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 5 ----- kernel/events/uprobes.c | 6 ++---- kernel/trace/trace_uprobe.c | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f628a6fc5b4..83742b91ff73 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -37,11 +37,6 @@ struct inode; struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); - /* - * filter is optional; If a filter exists, handler is run - * if and only if filter returns true. - */ - bool (*filter)(struct uprobe_consumer *self, struct task_struct *task); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a39d8163b713..5cbebac27c01 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -477,10 +477,8 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) return; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } + for (uc = uprobe->consumers; uc; uc = uc->next) + uc->handler(uc, regs); up_read(&uprobe->consumer_rwsem); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 87b6db4ccbc5..e668024773d4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -550,7 +550,6 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { kfree(utc); -- cgit v1.2.3 From 74e59dfc6b19e3472a7c16ad57bc831e6e647895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:54:08 +0100 Subject: uprobes: Change handle_swbp() to expose bp_vaddr to handler_chain() Change handle_swbp() to set regs->ip = bp_vaddr in 
advance, this is what consumer->handler() needs but uprobe_get_swbp_addr() is not exported. This also simplifies the code and makes it more consistent across the supported architectures. handle_swbp() becomes the only caller of uprobe_get_swbp_addr(). Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli --- arch/x86/kernel/uprobes.c | 1 - kernel/events/uprobes.c | 15 +++++++-------- kernel/trace/trace_uprobe.c | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 4e33a35d659e..0ba4cfb4f412 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -681,7 +681,6 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) continue; if (auprobe->insn[i] == 0x90) { - regs->ip = uprobe_get_swbp_addr(regs); regs->ip += i + 1; return true; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04c104ad9522..f1b807831fc2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1504,6 +1504,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1511,14 +1515,14 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; + goto out; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto restart; + goto out; } handler_chain(uprobe, regs); @@ -1531,12 +1535,7 @@ static void handle_swbp(struct pt_regs *regs) return; } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. 
- */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e668024773d4..17d9b2bcc28d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,7 +492,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -667,7 +667,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v1.2.3 From 84d7ed799fd6c1366547d88ddb8188c65de3b94f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:20:45 +0100 Subject: uprobes/tracing: Fix dentry/mount leak in create_trace_uprobe() create_trace_uprobe() does kern_path() to find ->d_inode, but forgets to do path_put(). We can do this right after igrab(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 17d9b2bcc28d..06c22bad776a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -253,16 +253,18 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = kstrtoul(arg, 0, &offset); - if (ret) - goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + path_put(&path); + if (!S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + argc -= 2; argv += 2; -- cgit v1.2.3 From 4161824f18ff4f56f46595a4016c7315dd0d24f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:36:24 +0100 Subject: uprobes/tracing: Fully initialize uprobe_trace_consumer before uprobe_register() probe_event_enable() does uprobe_register() and only after that sets utc->tu and tu->consumer/flags. This can race with uprobe_dispatcher() which can miss these assignments or see them out of order. Nothing really bad can happen, but this doesn't look clean/safe. And this does not allow to use uprobe_consumer->filter() we are going to add, it is called by uprobe_register() and it needs utc->tu. Change this code to initialize everything before uprobe_register(), and reset tu->consumer/flags if it fails. We can't race with event_disable(), the caller holds event_mutex, and if we could the code would be wrong anyway. In fact I think uprobe_trace_consumer should die, it buys nothing but complicates the code. We can simply add uprobe_consumer into trace_uprobe. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 06c22bad776a..15b8eceeddc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,17 +552,18 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; + utc->tu = tu; + tu->consumer = utc; + tu->flags |= flag; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { + tu->consumer = NULL; + tu->flags &= ~flag; kfree(utc); - return ret; } - tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; - - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) -- cgit v1.2.3 From 7e4e28c53963e6cfa94d8109bb8f5233c5659048 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2013 17:08:47 +0100 Subject: uprobes/tracing: Ensure inode != NULL in create_trace_uprobe() probe_event_enable/disable() check tu->inode != NULL at the start. This is ugly, if igrab() can fail create_trace_uprobe() should not succeed and "postpone" the failure. And S_ISREG(inode->i_mode) check added by d24d7dbf is not safe. Note: alloc_uprobe() should probably check igrab() != NULL as well. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 15b8eceeddc5..f7838cfd61b9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -256,7 +256,7 @@ static int create_trace_uprobe(int argc, char **argv) inode = igrab(path.dentry->d_inode); path_put(&path); - if (!S_ISREG(inode->i_mode)) { + if (!inode || !S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } @@ -544,7 +544,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) struct uprobe_trace_consumer *utc; int ret = 0; - if (!tu->inode || tu->consumer) + if (tu->consumer) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +568,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!tu->consumer) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From b64b007797c1e6d6b745c93c296ba1d5f4d72d86 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:15:30 +0100 Subject: uprobes/tracing: Introduce is_trace_uprobe_enabled() probe_event_enable/disable() check tu->consumer != NULL to avoid the wrong uprobe_register/unregister(). We are going to kill this pointer and "struct uprobe_trace_consumer", so we add the new helper, is_trace_uprobe_enabled(), which can rely on TP_FLAG_TRACE/TP_FLAG_PROFILE instead. Note: the current logic doesn't look optimal, it is not clear why TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive, we will probably change this later. Also kill the unused TP_FLAG_UPROBE. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_probe.h | 1 - kernel/trace/trace_uprobe.c | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 933708677814..5c7e09d10d74 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7838cfd61b9..d6c6e2a345a7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,12 +539,17 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) +{ + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + static int probe_event_enable(struct trace_uprobe *tu, int flag) { struct uprobe_trace_consumer *utc; int ret = 0; - if (tu->consumer) + if (is_trace_uprobe_enabled(tu)) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +573,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From a932b7381f81235530c3d0acbd3ba2c7537d78e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:47:23 +0100 Subject: uprobes/tracing: Kill uprobe_trace_consumer, embed uprobe_consumer into trace_uprobe trace_uprobe->consumer and "struct uprobe_trace_consumer" add the unnecessary indirection and complicate the code for no reason. This patch simply embeds uprobe_consumer into "struct trace_uprobe", all other changes only fix the compilation errors. 
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d6c6e2a345a7..9c8babbfd11b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -31,17 +31,11 @@ /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -92,6 +86,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; return tu; error: @@ -546,27 +541,15 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) static int probe_event_enable(struct trace_uprobe *tu, int flag) { - struct uprobe_trace_consumer *utc; int ret = 0; if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) - return -EINTR; - - utc->cons.handler = uprobe_dispatcher; - utc->tu = tu; - tu->consumer = utc; tu->flags |= flag; - - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - tu->consumer = NULL; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) tu->flags &= ~flag; - kfree(utc); - } return ret; } @@ -576,10 +559,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int 
uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -717,13 +698,9 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From 1b47aefd9b6bd439a4be43c47acd22987ac22db8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:55:27 +0100 Subject: uprobes/perf: Always increment trace_uprobe->nhit Move tu->nhit++ from uprobe_trace_func() to uprobe_dispatcher(). ->nhit counts how many times we hit the breakpoint inserted by this uprobe; we do not want to lose this info if the uprobe was enabled by sys_perf_event_open(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9c8babbfd11b..c4e29e19fdd7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -476,8 +476,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -701,6 +699,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From 736288ba5016e255869c26296014eeff649971c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 20:58:35 +0100 Subject: uprobes/perf: 
Teach trace_uprobe/perf code to track the active perf_event's Introduce "struct trace_uprobe_filter" which records the "active" perf_event's attached to ftrace_event_call. For the start we simply use list_head, we can optimize this later if needed. For example, we do not really need to record an event with ->parent != NULL, we can rely on parent->child_list. And we can certainly do some optimizations for the case when 2 events have the same ->tp_target or tp_target->mm. Change trace_uprobe_register() to process TRACE_REG_PERF_OPEN/CLOSE and add/del this perf_event to the list. We can probably avoid any locking, but let's start with the "obviously correct" trace_uprobe_filter->rwlock which protects everything. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4e29e19fdd7..2a74a93afdae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,12 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ @@ -35,6 +41,7 @@ struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; + struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; @@ -58,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* 
* Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -87,6 +106,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -544,6 +564,8 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) if (is_trace_uprobe_enabled(tu)) return -EINTR; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + tu->flags |= flag; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) @@ -557,6 +579,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; } @@ -632,6 +656,30 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_add(&event->hw.tp_list, &tu->filter.perf_events); + else + tu->filter.nr_systemwide++; + write_unlock(&tu->filter.rwlock); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_del(&event->hw.tp_list); + else + tu->filter.nr_systemwide--; + write_unlock(&tu->filter.rwlock); + + return 0; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -687,6 +735,13 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; -- cgit v1.2.3 From 31ba334836c0ac0039084859f14a5b96858493dc Mon 
Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:11:58 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to pre-filter Finally implement uprobe_perf_filter() which checks ->nr_systemwide or ->perf_events to figure out whether we need to insert the breakpoint. uprobe_perf_open/close are changed to do uprobe_apply(true/false) when the new perf event comes or goes away. Note that currently this is very suboptimal: - uprobe_register() called by TRACE_REG_PERF_REGISTER becomes a heavy nop, consumer->filter() always returns F at this stage. As it was already discussed we need uprobe_register_only() to avoid the costly register_for_each_vma() when possible. - uprobe_apply() is often overkill. Unless "nr_systemwide != 0" changes we need uprobe_apply_mm(), unapply_uprobe() is almost what we need. - uprobe_apply() can be simply avoided sometimes, see the next changes. Testing: # perf probe -x /lib/libc.so.6 syscall # perl -e 'syscall -1 while 1' & [1] 530 # perf record -e probe_libc:syscall perl -e 'syscall -1 for 1..10; sleep 1' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 79291 A huge ->nhit == 79291 reflects the fact that the background process 530 constantly hits this breakpoint too, even if it doesn't contribute to the output. After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 10 This shows that only the target process was punished by int3. 
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a74a93afdae..b7850f535acf 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,12 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; @@ -567,6 +572,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) tu->flags &= ~flag; @@ -656,6 +662,22 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { write_lock(&tu->filter.rwlock); @@ -665,6 +687,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide++; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return 0; } @@ -677,9 +701,25 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide--; 
write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return 0; } +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -722,7 +762,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -730,7 +770,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); -- cgit v1.2.3 From f42d24a1d20d2e72d1e5d48930f18b138dfad117 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:48:34 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to use UPROBE_HANDLER_REMOVE Change uprobe_trace_func() and uprobe_perf_func() to return "int". Change uprobe_dispatcher() to return "trace_ret | perf_ret" although this is not needed, currently TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive. The only functional change is that uprobe_perf_func() checks the filtering too and returns UPROBE_HANDLER_REMOVE if nobody wants to trace current. 
Testing: # perf probe -x /lib/libc.so.6 syscall # perf record -e probe_libc:syscall -i perl -e 'fork; syscall -1 for 1..10; wait' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 20 A child process doesn't have a counter, but still it hits this breakpoint "copied" by dup_mmap(). After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 11 The child process hits this int3 only once and does unapply_uprobe(). Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b7850f535acf..2399f1416555 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -486,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -504,7 +504,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(task_pt_regs(current)); @@ -514,6 +514,8 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -721,7 +723,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct 
pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -730,11 +732,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -752,6 +757,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -792,18 +798,19 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { -- cgit v1.2.3 From b2fe8ba674e8acbb9e8e63510b802c6d054d88a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 19:05:43 +0100 Subject: uprobes/perf: Avoid uprobe_apply() whenever possible uprobe_perf_open/close call the costly uprobe_apply() every time, we can avoid it if: - "nr_systemwide != 0" is not changed. - There is another process/thread with the same ->mm. - copy_process() does inherit_event(). dup_mmap() preserves the inserted breakpoints. - event->attr.enable_on_exec == T, we can rely on uprobe_mmap() called by exec/mmap paths. - tp_target is exiting. 
Only _close() checks PF_EXITING, I don't think TRACE_REG_PERF_OPEN can hit the dying task too often. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel/trace/trace_uprobe.c') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2399f1416555..8dad2a92dee9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -680,30 +680,60 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return false; } +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); list_add(&event->hw.tp_list, &tu->filter.perf_events); - else + } else { + done = tu->filter.nr_systemwide; tu->filter.nr_systemwide++; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { list_del(&event->hw.tp_list); - else + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -- cgit v1.2.3