diff options
Diffstat (limited to 'kernel/trace')
35 files changed, 1806 insertions, 681 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1052126bdca2..e9e95c790b8e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -51,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of regs_get_kernel_argument() and kernel_stack_pointer(). +config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + bool + help + If the architecture generates __patchable_function_entries sections + but does not want them included in the ftrace locations. + config HAVE_FTRACE_MCOUNT_RECORD bool help diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7f5eb295fe19..a995ea1ef849 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -346,8 +346,40 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { + blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); @@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(q, bt); + blk_trace_cleanup(q, bt); return 0; } @@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { - int ret; struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, @@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start) if (bt == NULL) return -EINVAL; - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - raw_spin_lock_irq(&running_trace_lock); - list_add(&bt->running_list, &running_trace_list); - raw_spin_unlock_irq(&running_trace_lock); - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) @@ -772,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->debugfs_mutex))) { - __blk_trace_startstop(q, 0); + lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); - } } #ifdef CONFIG_BLK_CGROUP @@ -1614,13 +1616,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - } + blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..1ed08967fb97 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,8 @@ #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/key.h> +#include <linux/verification.h> #include <net/bpf_sk_storage.h> @@ -685,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; err = __bpf_perf_event_output(regs, map, flags, sd); @@ -743,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; ret = __bpf_perf_event_output(regs, map, flags, sd); out: @@ -1026,11 +1030,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_X86_KERNEL_IBT +static unsigned long get_entry_ip(unsigned long fentry_ip) +{ + u32 instr; + + /* Being extra safe in here in case entry ip is on the page-edge. */ + if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) + return fentry_ip; + if (is_endbr(instr)) + fentry_ip -= ENDBR_INSN_SIZE; + return fentry_ip; +} +#else +#define get_entry_ip(fentry_ip) fentry_ip +#endif + BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (uintptr_t)kp->addr : 0; + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { @@ -1181,6 +1204,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_ptr: data to verify + * @sig_ptr: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, + struct bpf_dynptr_kern *sig_ptr, + struct bpf_key *trusted_keyring) +{ + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + return verify_pkcs7_signature(data_ptr->data, + bpf_dynptr_get_size(data_ptr), + sig_ptr->data, + bpf_dynptr_get_size(sig_ptr), + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +} +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1507,6 +1708,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; @@ -2042,9 +2246,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ @@ -2414,13 +2624,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, } static void -kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, +kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, struct pt_regs *regs) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, entry_ip, regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index aac63ca9c3d1..e8143e368074 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -141,6 +141,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) return -E2BIG; fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + if (!fp->rethook) + return -ENOMEM; for (i = 0; i < size; i++) { struct fprobe_rethook_node *node; @@ -301,7 +303,8 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || fp->ops.func != fprobe_handler) + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bc921a3f7ea8..33236241f236 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1289,6 +1289,7 @@ static int ftrace_add_mod(struct trace_array *tr, if (!ftrace_mod) return -ENOMEM; + INIT_LIST_HEAD(&ftrace_mod->list); ftrace_mod->func = kstrdup(func, GFP_KERNEL); ftrace_mod->module = kstrdup(module, GFP_KERNEL); ftrace_mod->enable = enable; @@ -1644,6 +1645,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); +static bool skip_record(struct dyn_ftrace *rec) +{ + /* + * At boot up, weak functions are set to disable. Function tracing + * can be enabled before they are, and they still need to be disabled now. + * If the record is disabled, still continue if it is marked as already + * enabled (this is needed to keep the accounting working). + */ + return rec->flags & FTRACE_FL_DISABLED && + !(rec->flags & FTRACE_FL_ENABLED); +} + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1693,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; if (all) { @@ -1861,8 +1874,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -2018,7 +2029,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { char ins[MCOUNT_INSN_SIZE]; - int i; if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); @@ -2026,9 +2036,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) } printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]); + pr_cont("%*phC", MCOUNT_INSN_SIZE, ins); } enum ftrace_bug_type ftrace_bug_type; @@ -2128,7 +2136,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) ftrace_bug_type = FTRACE_BUG_UNKNOWN; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) return FTRACE_UPDATE_IGNORE; /* @@ -2243,7 +2251,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) if (update) { /* If there's no more users, clear all flags */ if (!ftrace_rec_count(rec)) - rec->flags = 0; + rec->flags &= FTRACE_FL_DISABLED; else /* * Just disable the record, but keep the ops TRAMP @@ -2636,7 +2644,7 @@ void __weak ftrace_replace_code(int mod_flags) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; failed = __ftrace_replace_code(rec, enable); @@ -2974,6 +2982,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; @@ -3011,18 +3029,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) { - /* - * If these are dynamic or per_cpu ops, they still - * need their data freed. Since, function tracing is - * not currently active, we can just free them - * without synchronizing all CPUs. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - goto free_ops; - - return 0; - } + if (!command || !ftrace_enabled) + goto out; /* * If the ops uses a trampoline, then it needs to be @@ -3059,6 +3067,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) removed_ops = NULL; ops->flags &= ~FTRACE_OPS_FL_REMOVING; +out: /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. @@ -3086,7 +3095,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPTION)) synchronize_rcu_tasks(); - free_ops: ftrace_trampoline_free(ops); } @@ -3108,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -3226,7 +3191,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) /* if we can't allocate this size, try something smaller */ if (!order) return -ENOMEM; - order >>= 1; + order--; goto again; } @@ -5462,6 +5427,8 @@ static struct ftrace_ops stub_ops = { * it is safe to modify the ftrace record, where it should be * currently calling @old_addr directly, to call @new_addr. * + * This is called with direct_mutex locked. + * * Safety checks should be made to make sure that the code at * @rec->ip is currently calling @old_addr. And this must * also update entry->direct to @new_addr. @@ -5474,6 +5441,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ip = rec->ip; int ret; + lockdep_assert_held(&direct_mutex); + /* * The ftrace_lock was used to determine if the record * had more than one registered user to it. If it did, @@ -5496,7 +5465,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, if (ret) goto out_lock; - ret = register_ftrace_function(&stub_ops); + ret = register_ftrace_function_nolock(&stub_ops); if (ret) { ftrace_set_filter_ip(&stub_ops, ip, 1, 0); goto out_lock; @@ -6116,8 +6085,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (iter->tr && !list_empty(&iter->tr->mod_trace)) - iter->hash->flags |= FTRACE_HASH_FL_MOD; + if (iter->tr) { + if (list_empty(&iter->tr->mod_trace)) + iter->hash->flags &= ~FTRACE_HASH_FL_MOD; + else + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } } else orig_hash = &iter->ops->func_hash->notrace_hash; @@ -6812,6 +6785,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6824,7 +6829,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) @@ -7387,7 +7392,7 @@ void __init ftrace_init(void) } pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, count / ENTRIES_PER_PAGE + 1); + count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); ret = ftrace_process_locs(NULL, __start_mcount_loc, @@ -8268,8 +8273,7 @@ static int kallsyms_callback(void *data, const char *name, if (args->addrs[idx]) return 0; - addr = ftrace_location(addr); - if (!addr) + if (!ftrace_location(addr)) return 0; args->addrs[idx] = addr; diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index 18b0f1cbb947..c736487fc0e4 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -35,6 +35,49 @@ static struct trace_event_file *gen_kprobe_test; static struct trace_event_file *gen_kretprobe_test; +#define KPROBE_GEN_TEST_FUNC "do_sys_open" + +/* X86 */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32) +#define KPROBE_GEN_TEST_ARG0 "dfd=%ax" +#define KPROBE_GEN_TEST_ARG1 "filename=%dx" +#define KPROBE_GEN_TEST_ARG2 "flags=%cx" +#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)" + +/* ARM64 */ +#elif defined(CONFIG_ARM64) +#define KPROBE_GEN_TEST_ARG0 "dfd=%x0" +#define KPROBE_GEN_TEST_ARG1 "filename=%x1" +#define KPROBE_GEN_TEST_ARG2 "flags=%x2" +#define KPROBE_GEN_TEST_ARG3 "mode=%x3" + +/* ARM */ +#elif defined(CONFIG_ARM) +#define KPROBE_GEN_TEST_ARG0 "dfd=%r0" +#define KPROBE_GEN_TEST_ARG1 "filename=%r1" +#define KPROBE_GEN_TEST_ARG2 "flags=%r2" +#define KPROBE_GEN_TEST_ARG3 "mode=%r3" + +/* RISCV */ +#elif defined(CONFIG_RISCV) +#define KPROBE_GEN_TEST_ARG0 "dfd=%a0" +#define KPROBE_GEN_TEST_ARG1 "filename=%a1" +#define KPROBE_GEN_TEST_ARG2 "flags=%a2" +#define KPROBE_GEN_TEST_ARG3 "mode=%a3" + +/* others */ +#else +#define KPROBE_GEN_TEST_ARG0 NULL +#define KPROBE_GEN_TEST_ARG1 NULL +#define KPROBE_GEN_TEST_ARG2 NULL +#define KPROBE_GEN_TEST_ARG3 NULL +#endif + +static bool trace_event_file_is_valid(struct trace_event_file *input) +{ + return input && !IS_ERR(input); +} + /* * Test to make sure we can create a kprobe event, then add more * fields. @@ -58,23 +101,23 @@ static int __init test_gen_kprobe_cmd(void) * fields. */ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", - "do_sys_open", - "dfd=%ax", "filename=%dx"); + KPROBE_GEN_TEST_FUNC, + KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1); if (ret) - goto free; + goto out; /* Use kprobe_event_add_fields to add the rest of the fields */ - ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)"); + ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3); if (ret) - goto free; + goto out; /* * This actually creates the event. */ ret = kprobe_event_gen_cmd_end(&cmd); if (ret) - goto free; + goto out; /* * Now get the gen_kprobe_test event file. We need to prevent @@ -97,13 +140,13 @@ static int __init test_gen_kprobe_cmd(void) goto delete; } out: + kfree(buf); return ret; delete: + if (trace_event_file_is_valid(gen_kprobe_test)) + gen_kprobe_test = NULL; /* We got an error after creating the event, delete it */ ret = kprobe_event_delete("gen_kprobe_test"); - free: - kfree(buf); - goto out; } @@ -128,17 +171,17 @@ static int __init test_gen_kretprobe_cmd(void) * Define the kretprobe event. */ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", - "do_sys_open", + KPROBE_GEN_TEST_FUNC, "$retval"); if (ret) - goto free; + goto out; /* * This actually creates the event. */ ret = kretprobe_event_gen_cmd_end(&cmd); if (ret) - goto free; + goto out; /* * Now get the gen_kretprobe_test event file. We need to @@ -162,13 +205,13 @@ static int __init test_gen_kretprobe_cmd(void) goto delete; } out: + kfree(buf); return ret; delete: + if (trace_event_file_is_valid(gen_kretprobe_test)) + gen_kretprobe_test = NULL; /* We got an error after creating the event, delete it */ ret = kprobe_event_delete("gen_kretprobe_test"); - free: - kfree(buf); - goto out; } @@ -182,10 +225,12 @@ static int __init kprobe_event_gen_test_init(void) ret = test_gen_kretprobe_cmd(); if (ret) { - WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, - "kprobes", - "gen_kretprobe_test", false)); - trace_put_event_file(gen_kretprobe_test); + if (trace_event_file_is_valid(gen_kretprobe_test)) { + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + trace_put_event_file(gen_kretprobe_test); + } WARN_ON(kprobe_event_delete("gen_kretprobe_test")); } @@ -194,24 +239,30 @@ static int __init kprobe_event_gen_test_init(void) static void __exit kprobe_event_gen_test_exit(void) { - /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, - "kprobes", - "gen_kprobe_test", false)); + if (trace_event_file_is_valid(gen_kprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", + "gen_kprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kprobe_test); + } - /* Now give the file and instance back */ - trace_put_event_file(gen_kprobe_test); /* Now unregister and free the event */ WARN_ON(kprobe_event_delete("gen_kprobe_test")); - /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, - "kprobes", - "gen_kretprobe_test", false)); + if (trace_event_file_is_valid(gen_kretprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kretprobe_test); + } - /* Now give the file and instance back */ - trace_put_event_file(gen_kretprobe_test); /* Now unregister and free the event */ WARN_ON(kprobe_event_delete("gen_kretprobe_test")); diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index c69d82273ce7..32c3dfdb4d6a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -83,8 +83,10 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler) { struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); - if (!rh || !handler) + if (!rh || !handler) { + kfree(rh); return NULL; + } rh->data = data; rh->handler = handler; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d59b6a328b7f..b21bf14bae9b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -413,6 +413,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -518,6 +519,7 @@ struct ring_buffer_per_cpu { local_t committing; local_t commits; local_t pages_touched; + local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; @@ -884,7 +886,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) } /** - * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * @@ -893,10 +895,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; + size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); + lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); + + if (WARN_ON_ONCE(cnt < lost)) + return 0; + + cnt -= lost; + /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); @@ -906,6 +916,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) return cnt - read; } +static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + size_t nr_pages; + size_t dirty; + + nr_pages = cpu_buffer->nr_pages; + if (!nr_pages || !full) + return true; + + dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + + return (dirty * 100) > (full * nr_pages); +} + /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * @@ -917,13 +942,56 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); - if (rbwork->wakeup_full) { + if (rbwork->full_waiters_pending || rbwork->wakeup_full) { rbwork->wakeup_full = false; + rbwork->full_waiters_pending = false; wake_up_all(&rbwork->full_waiters); } } /** + * ring_buffer_wake_waiters - wake up any waiters on this ring buffer + * @buffer: The ring buffer to wake waiters on + * + * In the case of a file that represents a ring buffer is closing, + * it is prudent to wake up any waiters that are on this. + */ +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *rbwork; + + if (!buffer) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) { + + /* Wake up individual ones too. One level recursion */ + for_each_buffer_cpu(buffer, cpu) + ring_buffer_wake_waiters(buffer, cpu); + + rbwork = &buffer->irq_work; + } else { + if (WARN_ON_ONCE(!buffer->buffers)) + return; + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* The CPU buffer may not have been initialized yet */ + if (!cpu_buffer) + return; + rbwork = &cpu_buffer->irq_work; + } + + rbwork->wait_index++; + /* make sure the waiters see the new index */ + smp_wmb(); + + rb_wake_up_waiters(&rbwork->work); +} + +/** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on @@ -938,6 +1006,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; + long wait_index; int ret = 0; /* @@ -956,6 +1025,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } + wait_index = READ_ONCE(work->wait_index); while (true) { if (full) @@ -1000,26 +1070,29 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) !ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; - size_t nr_pages; - size_t dirty; + bool done; if (!full) break; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + done = !pagebusy && full_hit(buffer, cpu, full); + if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (!pagebusy && - (!nr_pages || (dirty * 100) > full * nr_pages)) + if (done) break; } schedule(); + + /* Make sure to see the new wait index */ + smp_rmb(); + if (wait_index != work->wait_index) + break; } if (full) @@ -1036,6 +1109,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise @@ -1045,14 +1119,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, - struct file *filp, poll_table *poll_table) + struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *work; - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + full = 0; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; @@ -1060,8 +1135,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + if (full) { + poll_wait(filp, &work->full_waiters, poll_table); + work->full_waiters_pending = true; + } else { + poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + } + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -1077,6 +1158,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); + if (full) + return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -1718,9 +1802,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(cpu_buffer->reader_page); - rb_head_page_deactivate(cpu_buffer); - if (head) { + rb_head_page_deactivate(cpu_buffer); + list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); @@ -1956,6 +2040,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) */ local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); } /* @@ -2440,6 +2525,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, */ local_add(entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); /* * The entries will be zeroed out when we move the @@ -2608,6 +2694,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Mark the rest of the page with padding */ rb_event_set_padding(event); + /* Make sure the padding is visible before the write update */ + smp_wmb(); + /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; @@ -2619,6 +2708,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); @@ -3098,10 +3190,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { - size_t nr_pages; - size_t dirty; - size_t full; - if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -3125,10 +3213,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); - full = cpu_buffer->shortest_full; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); - if (full && nr_pages && (dirty * 100) <= full * nr_pages) + if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; @@ -4587,6 +4672,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + /* + * The writer has preempt disable, wait for it. But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()) + */ + smp_rmb(); + + return reader; } @@ -5164,6 +5276,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; @@ -5232,7 +5345,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** - * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ @@ -5302,7 +5415,7 @@ void ring_buffer_reset(struct trace_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_reset); /** - * rind_buffer_empty - is the ring buffer empty? + * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) @@ -5616,7 +5729,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int pos = 0; unsigned int size; - if (full) + /* + * If a full page is expected, this can still be returned + * if there's been a previous partial read and the + * rest of the page can be read and the commit page is off + * the reader page. + */ + if (full && + (!read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 83cace53b9fa..b2b49a27e886 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -16,7 +16,7 @@ #include "wip.h" -struct rv_monitor rv_wip; +static struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) @@ -60,7 +60,7 @@ static void disable_wip(void) da_monitor_destroy_wip(); } -struct rv_monitor rv_wip = { +static struct rv_monitor rv_wip = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, @@ -69,13 +69,13 @@ struct rv_monitor rv_wip = { .enabled = 0, }; -static int register_wip(void) +static int __init register_wip(void) { rv_register_monitor(&rv_wip); return 0; } -static void unregister_wip(void) +static void __exit unregister_wip(void) { rv_unregister_monitor(&rv_wip); } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c1c47e2305ef..dacc37b62a2c 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -struct automaton_wip automaton_wip = { +static struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 599225d9cf38..0e43dd2db685 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -15,7 +15,7 @@ #include "wwnr.h" -struct rv_monitor rv_wwnr; +static struct rv_monitor rv_wwnr; DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); static void handle_switch(void *data, bool preempt, struct task_struct *p, @@ -59,7 +59,7 @@ static void disable_wwnr(void) da_monitor_destroy_wwnr(); } -struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_wwnr = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, @@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = { .enabled = 0, }; -static int register_wwnr(void) +static int __init register_wwnr(void) { rv_register_monitor(&rv_wwnr); return 0; } -static void unregister_wwnr(void) +static void __exit unregister_wwnr(void) { rv_unregister_monitor(&rv_wwnr); } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d1afe55cdd4c..118e576b91b4 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -struct automaton_wwnr automaton_wwnr = { +static struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index b698d05dd069..d65f6c25a87c 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = { .react = rv_panic_reaction }; -static int register_react_panic(void) +static int __init register_react_panic(void) { rv_register_reactor(&rv_panic); return 0; } -static void unregister_react_panic(void) +static void __exit unregister_react_panic(void) { rv_unregister_reactor(&rv_panic); } diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 31899f953af4..4b6b7106a477 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = { .react = rv_printk_reaction }; -static int register_react_printk(void) +static int __init register_react_printk(void) { rv_register_reactor(&rv_printk); return 0; } -static void unregister_react_printk(void) +static void __exit unregister_react_printk(void) { rv_unregister_reactor(&rv_printk); } diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 0b15e975d2c2..8d77526892f4 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void) /* Now generate a gen_synth_test event */ ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("gen_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } /* @@ -227,15 +225,13 @@ static int __init test_empty_synth_event(void) /* Now trace an empty_synth_test event */ ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("empty_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } static struct synth_field_desc create_synth_test_fields[] = { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3005279165d..a7fe0e115272 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return cond_data; } @@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); mutex_unlock(&trace_types_lock); @@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) @@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return ret; } @@ -2200,6 +2206,11 @@ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk) * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. */ + lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; @@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, char buf[64]; int r; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; } @@ -6407,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->current_trace->use_max_tr; + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->allocated_snapshot; - if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that @@ -6425,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } - if (t->use_max_tr && !had_max_tr) { + if (t->use_max_tr && !tr->allocated_snapshot) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } +#else + tr->current_trace = &nop_trace; #endif if (t->init) { @@ -6634,6 +6657,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + kfree(iter->fmt); mutex_destroy(&iter->mutex); kfree(iter); @@ -6658,7 +6682,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, - filp, poll_table); + filp, poll_table, iter->tr->buffer_percent); } static __poll_t @@ -7436,10 +7460,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; @@ -7777,6 +7803,7 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, int len) { struct tracing_log_err *err; + char *cmd; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = alloc_tracing_log_err(len); @@ -7785,12 +7812,12 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, return err; } - + cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) + return ERR_PTR(-ENOMEM); err = list_first_entry(&tr->err_log, struct tracing_log_err, list); kfree(err->cmd); - err->cmd = kzalloc(len, GFP_KERNEL); - if (!err->cmd) - return ERR_PTR(-ENOMEM); + err->cmd = cmd; list_del(&err->list); return err; @@ -8137,6 +8164,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8290,6 +8323,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { + long wait_index; + if (ret) goto out; @@ -8297,10 +8332,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; + wait_index = READ_ONCE(iter->wait_index); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + + /* Make sure we see the new wait_index */ + smp_rmb(); + if (wait_index != iter->wait_index) + goto out; + goto again; } @@ -8311,12 +8357,34 @@ out: return ret; } +/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + if (cmd) + return -ENOIOCTLCMD; + + mutex_lock(&trace_types_lock); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + mutex_unlock(&trace_types_lock); + return 0; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, + .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, }; @@ -9005,6 +9073,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf, tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } mutex_unlock(&trace_types_lock); } @@ -10091,7 +10161,7 @@ __init static int tracer_alloc_buffers(void) * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, - "trace/RB:preapre", trace_rb_cpu_prepare, + "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 900e75d96c84..54ee5711c729 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); - typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { @@ -1455,17 +1453,6 @@ struct regex { regex_match_func match; }; -struct filter_pred { - filter_pred_fn_t fn; - u64 val; - struct regex regex; - unsigned short *ops; - struct ftrace_event_field *field; - int offset; - int not; - int op; -}; - static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 801c2a7f7605..54d5fa35c90a 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -51,7 +51,7 @@ static void trace_do_benchmark(void) local_irq_disable(); start = trace_clock_local(); - trace_benchmark_event(bm_str); + trace_benchmark_event(bm_str, bm_last); stop = trace_clock_local(); local_irq_enable(); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 79e6fbe5b365..c3e91060dc94 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void); TRACE_EVENT_FN(benchmark_event, - TP_PROTO(const char *str), + TP_PROTO(const char *str, u64 delta), - TP_ARGS(str), + TP_ARGS(str, delta), TP_STRUCT__entry( __array( char, str, BENCHMARK_EVENT_STRLEN ) + __field( u64, delta) ), TP_fast_assign( memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + __entry->delta = delta; ), - TP_printk("%s", __entry->str), + TP_printk("%s delta=%llu", __entry->str, __entry->delta), trace_benchmark_reg, trace_benchmark_unreg ); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 4a0e9d927443..352b65e2b910 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -16,6 +16,7 @@ #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -26,6 +27,9 @@ struct trace_eprobe { /* tracepoint event */ const char *event_name; + /* filter string for the tracepoint */ + char *filter_str; + struct trace_event_call *event; struct dyn_event devent; @@ -48,6 +52,7 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep->event_system); if (ep->event) trace_event_put_ref(ep->event); + kfree(ep->filter_str); kfree(ep); } @@ -227,6 +232,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -236,9 +242,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -311,6 +328,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) @@ -342,16 +380,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -369,8 +429,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -378,29 +458,14 @@ NOKPROBE_SYMBOL(process_fetch_insn) static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -410,21 +475,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -434,29 +485,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int @@ -535,6 +564,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data, { struct eprobe_data *edata = data->private_data; + if (unlikely(!rec)) + return; + __eprobe_trace_func(edata, rec); } @@ -589,14 +621,15 @@ static struct event_trigger_data * new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) { struct event_trigger_data *trigger; + struct event_filter *filter = NULL; struct eprobe_data *edata; + int ret; edata = kzalloc(sizeof(*edata), GFP_KERNEL); trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); if (!trigger || !edata) { - kfree(edata); - kfree(trigger); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto error; } trigger->flags = EVENT_TRIGGER_FL_PROBE; @@ -611,13 +644,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->cmd_ops = &event_trigger_cmd; INIT_LIST_HEAD(&trigger->list); - RCU_INIT_POINTER(trigger->filter, NULL); + + if (ep->filter_str) { + ret = create_event_filter(file->tr, ep->event, + ep->filter_str, false, &filter); + if (ret) + goto error; + } + RCU_INIT_POINTER(trigger->filter, filter); edata->file = file; edata->ep = ep; trigger->private_data = edata; return trigger; +error: + free_event_filter(filter); + kfree(edata); + kfree(trigger); + return ERR_PTR(ret); } static int enable_eprobe(struct trace_eprobe *ep, @@ -651,6 +696,7 @@ static int disable_eprobe(struct trace_eprobe *ep, { struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; + struct event_filter *filter; struct eprobe_data *edata; file = find_event_file(tr, ep->event_system, ep->event_name); @@ -677,6 +723,10 @@ static int disable_eprobe(struct trace_eprobe *ep, /* Make sure nothing is using the edata or trigger */ tracepoint_synchronize_unregister(); + filter = rcu_access_pointer(trigger->filter); + + if (filter) + free_event_filter(filter); kfree(edata); kfree(trigger); @@ -845,6 +895,60 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ trace_probe_log_err(0, BAD_ATTACH_ARG); } + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); + + return ret; +} + +static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) +{ + struct event_filter *dummy = NULL; + int i, ret, len = 0; + char *p; + + if (argc == 0) { + trace_probe_log_err(0, NO_EP_FILTER); + return -EINVAL; + } + + /* Recover the filter string */ + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + + ep->filter_str = kzalloc(len, GFP_KERNEL); + if (!ep->filter_str) + return -ENOMEM; + + p = ep->filter_str; + for (i = 0; i < argc; i++) { + ret = snprintf(p, len, "%s ", argv[i]); + if (ret < 0) + goto error; + if (ret > len) { + ret = -E2BIG; + goto error; + } + p += ret; + len -= ret; + } + p[-1] = '\0'; + + /* + * Ensure the filter string can be parsed correctly. Note, this + * filter string is for the original event, not for the eprobe. + */ + ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, + true, &dummy); + free_event_filter(dummy); + if (ret) + goto error; + + return 0; +error: + kfree(ep->filter_str); + ep->filter_str = NULL; return ret; } @@ -852,8 +956,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] - * Fetch args: + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER] + * Fetch args (no space): * <name>=$<field>[:TYPE] */ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; @@ -863,8 +967,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - int ret = 0; - int i; + int ret = 0, filter_idx = 0; + int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; @@ -883,17 +987,25 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) { + if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; } if (!event) { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); + strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); event = buf1; } + for (i = 2; i < argc; i++) { + if (!strcmp(argv[i], "if")) { + filter_idx = i + 1; + filter_cnt = argc - filter_idx; + argc = i; + break; + } + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -909,6 +1021,14 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } + if (filter_idx) { + trace_probe_log_set_index(filter_idx); + ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); + if (ret) + goto parse_error; + } else + ep->filter_str = NULL; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 181f08186d32..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4b1057ab9d96..96acc2b71ac7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -43,6 +43,42 @@ enum filter_op_ids { OPS }; static const char * ops[] = { OPS }; +enum filter_pred_fn { + FILTER_PRED_FN_NOP, + FILTER_PRED_FN_64, + FILTER_PRED_FN_S64, + FILTER_PRED_FN_U64, + FILTER_PRED_FN_32, + FILTER_PRED_FN_S32, + FILTER_PRED_FN_U32, + FILTER_PRED_FN_16, + FILTER_PRED_FN_S16, + FILTER_PRED_FN_U16, + FILTER_PRED_FN_8, + FILTER_PRED_FN_S8, + FILTER_PRED_FN_U8, + FILTER_PRED_FN_COMM, + FILTER_PRED_FN_STRING, + FILTER_PRED_FN_STRLOC, + FILTER_PRED_FN_STRRELLOC, + FILTER_PRED_FN_PCHAR_USER, + FILTER_PRED_FN_PCHAR, + FILTER_PRED_FN_CPU, + FILTER_PRED_FN_, + FILTER_PRED_TEST_VISITED, +}; + +struct filter_pred { + enum filter_pred_fn fn_num; + u64 val; + struct regex regex; + unsigned short *ops; + struct ftrace_event_field *field; + int offset; + int not; + int op; +}; + /* * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above. @@ -590,44 +626,48 @@ out_free: return ERR_PTR(ret); } +enum pred_cmp_types { + PRED_CMP_TYPE_NOP, + PRED_CMP_TYPE_LT, + PRED_CMP_TYPE_LE, + PRED_CMP_TYPE_GT, + PRED_CMP_TYPE_GE, + PRED_CMP_TYPE_BAND, +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr < val; \ -} \ -static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr <= val; \ -} \ -static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr > val; \ -} \ -static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr >= val; \ -} \ -static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return !!(*addr & val); \ -} \ -static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LE_##type, \ - filter_pred_LT_##type, \ - filter_pred_GE_##type, \ - filter_pred_GT_##type, \ - filter_pred_BAND_##type, \ -}; + switch (pred->op) { \ + case OP_LT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr < val; \ + } \ + case OP_LE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr <= val; \ + } \ + case OP_GT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr > val; \ + } \ + case OP_GE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr >= val; \ + } \ + case OP_BAND: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return !!(*addr & val); \ + } \ + default: \ + return 0; \ + } \ +} #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } -static int filter_pred_none(struct filter_pred *pred, void *event) -{ - return 0; -} - /* * regex_match_foo - Basic regex callbacks * @@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred) } } + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int test_pred_visited_fn(struct filter_pred *pred, void *event); +#else +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + return 0; +} +#endif + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event); + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; - int match = pred->fn(pred, rec); + int match = filter_pred_fn_call(pred, rec); if (match == prog[i].when_to_branch) i = prog[i].target; } @@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, - int field_size, int field_is_signed) +static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { - filter_pred_fn_t fn = NULL; + enum filter_pred_fn fn = FILTER_PRED_FN_NOP; int pred_func_index = -1; switch (op) { @@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, break; default: if (WARN_ON_ONCE(op < PRED_FUNC_START)) - return NULL; + return fn; pred_func_index = op - PRED_FUNC_START; if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) - return NULL; + return fn; } switch (field_size) { case 8: if (pred_func_index < 0) - fn = filter_pred_64; + fn = FILTER_PRED_FN_64; else if (field_is_signed) - fn = pred_funcs_s64[pred_func_index]; + fn = FILTER_PRED_FN_S64; else - fn = pred_funcs_u64[pred_func_index]; + fn = FILTER_PRED_FN_U64; break; case 4: if (pred_func_index < 0) - fn = filter_pred_32; + fn = FILTER_PRED_FN_32; else if (field_is_signed) - fn = pred_funcs_s32[pred_func_index]; + fn = FILTER_PRED_FN_S32; else - fn = pred_funcs_u32[pred_func_index]; + fn = FILTER_PRED_FN_U32; break; case 2: if (pred_func_index < 0) - fn = filter_pred_16; + fn = FILTER_PRED_FN_16; else if (field_is_signed) - fn = pred_funcs_s16[pred_func_index]; + fn = FILTER_PRED_FN_S16; else - fn = pred_funcs_u16[pred_func_index]; + fn = FILTER_PRED_FN_U16; break; case 1: if (pred_func_index < 0) - fn = filter_pred_8; + fn = FILTER_PRED_FN_8; else if (field_is_signed) - fn = pred_funcs_s8[pred_func_index]; + fn = FILTER_PRED_FN_S8; else - fn = pred_funcs_u8[pred_func_index]; + fn = FILTER_PRED_FN_U8; break; } return fn; } + +static int filter_pred_fn_call(struct filter_pred *pred, void *event) +{ + switch (pred->fn_num) { + case FILTER_PRED_FN_64: + return filter_pred_64(pred, event); + case FILTER_PRED_FN_S64: + return filter_pred_s64(pred, event); + case FILTER_PRED_FN_U64: + return filter_pred_u64(pred, event); + case FILTER_PRED_FN_32: + return filter_pred_32(pred, event); + case FILTER_PRED_FN_S32: + return filter_pred_s32(pred, event); + case FILTER_PRED_FN_U32: + return filter_pred_u32(pred, event); + case FILTER_PRED_FN_16: + return filter_pred_16(pred, event); + case FILTER_PRED_FN_S16: + return filter_pred_s16(pred, event); + case FILTER_PRED_FN_U16: + return filter_pred_u16(pred, event); + case FILTER_PRED_FN_8: + return filter_pred_8(pred, event); + case FILTER_PRED_FN_S8: + return filter_pred_s8(pred, event); + case FILTER_PRED_FN_U8: + return filter_pred_u8(pred, event); + case FILTER_PRED_FN_COMM: + return filter_pred_comm(pred, event); + case FILTER_PRED_FN_STRING: + return filter_pred_string(pred, event); + case FILTER_PRED_FN_STRLOC: + return filter_pred_strloc(pred, event); + case FILTER_PRED_FN_STRRELLOC: + return filter_pred_strrelloc(pred, event); + case FILTER_PRED_FN_PCHAR_USER: + return filter_pred_pchar_user(pred, event); + case FILTER_PRED_FN_PCHAR: + return filter_pred_pchar(pred, event); + case FILTER_PRED_FN_CPU: + return filter_pred_cpu(pred, event); + case FILTER_PRED_TEST_VISITED: + return test_pred_visited_fn(pred, event); + default: + return 0; + } +} + /* Called when a predicate is encountered by predicate_parse() */ static int parse_pred(const char *str, void *data, int pos, struct filter_parse_error *pe, @@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data, parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); goto err_free; } - pred->fn = filter_pred_none; + pred->fn_num = FILTER_PRED_FN_NOP; /* * Quotes are not required, but if they exist then we need @@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data, filter_build_regex(pred); if (field->filter_type == FILTER_COMM) { - pred->fn = filter_pred_comm; + pred->fn_num = FILTER_PRED_FN_COMM; } else if (field->filter_type == FILTER_STATIC_STRING) { - pred->fn = filter_pred_string; + pred->fn_num = FILTER_PRED_FN_STRING; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - pred->fn = filter_pred_strloc; + pred->fn_num = FILTER_PRED_FN_STRLOC; } else if (field->filter_type == FILTER_RDYN_STRING) - pred->fn = filter_pred_strrelloc; + pred->fn_num = FILTER_PRED_FN_STRRELLOC; else { if (!ustring_per_cpu) { @@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data, } if (ustring) - pred->fn = filter_pred_pchar_user; + pred->fn_num = FILTER_PRED_FN_PCHAR_USER; else - pred->fn = filter_pred_pchar; + pred->fn_num = FILTER_PRED_FN_PCHAR; } /* go past the last quote */ i++; @@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data, pred->val = val; if (field->filter_type == FILTER_CPU) - pred->fn = filter_pred_cpu; + pred->fn_num = FILTER_PRED_FN_CPU; else { - pred->fn = select_comparison_fn(pred->op, field->size, - field->is_signed); + pred->fn_num = select_comparison_fn(pred->op, field->size, + field->is_signed); if (pred->op == OP_NE) pred->not = 1; } @@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; - WARN_ON_ONCE(!pred->fn); + WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); if (!field) { WARN_ONCE(1, "all leafs should have field defined %d", i); @@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) if (!strchr(fields, *field->name)) continue; - pred->fn = test_pred_visited_fn; + pred->fn_num = FILTER_PRED_TEST_VISITED; } } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fdf784620c28..48465f7e97b4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -104,6 +104,38 @@ enum field_op_id { FIELD_OP_MULT, }; +enum hist_field_fn { + HIST_FIELD_FN_NOP, + HIST_FIELD_FN_VAR_REF, + HIST_FIELD_FN_COUNTER, + HIST_FIELD_FN_CONST, + HIST_FIELD_FN_LOG2, + HIST_FIELD_FN_BUCKET, + HIST_FIELD_FN_TIMESTAMP, + HIST_FIELD_FN_CPU, + HIST_FIELD_FN_STRING, + HIST_FIELD_FN_DYNSTRING, + HIST_FIELD_FN_RELDYNSTRING, + HIST_FIELD_FN_PSTRING, + HIST_FIELD_FN_S64, + HIST_FIELD_FN_U64, + HIST_FIELD_FN_S32, + HIST_FIELD_FN_U32, + HIST_FIELD_FN_S16, + HIST_FIELD_FN_U16, + HIST_FIELD_FN_S8, + HIST_FIELD_FN_U8, + HIST_FIELD_FN_UMINUS, + HIST_FIELD_FN_MINUS, + HIST_FIELD_FN_PLUS, + HIST_FIELD_FN_DIV, + HIST_FIELD_FN_MULT, + HIST_FIELD_FN_DIV_POWER2, + HIST_FIELD_FN_DIV_NOT_POWER2, + HIST_FIELD_FN_DIV_MULT_SHIFT, + HIST_FIELD_FN_EXECNAME, +}; + /* * A hist_var (histogram variable) contains variable information for * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF @@ -123,15 +155,15 @@ struct hist_var { struct hist_field { struct ftrace_event_field *field; unsigned long flags; - hist_field_fn_t fn; - unsigned int ref; - unsigned int size; - unsigned int offset; - unsigned int is_signed; unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + enum hist_field_fn fn_num; + unsigned int ref; + unsigned int size; + unsigned int offset; + unsigned int is_signed; /* * Variable fields contain variable-specific info in var. @@ -166,14 +198,11 @@ struct hist_field { u64 div_multiplier; }; -static u64 hist_field_none(struct hist_field *field, - struct tracing_map_elt *elt, - struct trace_buffer *buffer, - struct ring_buffer_event *rbe, - void *event) -{ - return 0; -} +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); static u64 hist_field_const(struct hist_field *field, struct tracing_map_elt *elt, @@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field, struct hist_field *operand = hist_field->operands[0]; unsigned long buckets = hist_field->buckets; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); if (WARN_ON_ONCE(!buckets)) return val; @@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 + val2; } @@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 - val2; } @@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); /* Return -1 for the undefined case */ if (!val2) @@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return val1 >> __ffs64(operand2->constant); } @@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return div64_u64(val1, operand2->constant); } @@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); /* * If the divisor is a constant, do a multiplication and shift instead. @@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 * val2; } @@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); + s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -657,19 +686,19 @@ struct snapshot_context { * Returns the specific division function to use if the divisor * is constant. This avoids extra branches when the trigger is hit. */ -static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor) +static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor) { u64 div = divisor->constant; if (!(div & (div - 1))) - return div_by_power_of_two; + return HIST_FIELD_FN_DIV_POWER2; /* If the divisor is too large, do a regular division */ if (div > (1 << HIST_DIV_SHIFT)) - return div_by_not_power_of_two; + return HIST_FIELD_FN_DIV_NOT_POWER2; divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); - return div_by_mult_and_shift; + return HIST_FIELD_FN_DIV_MULT_SHIFT; } static void track_data_free(struct track_data *track_data) @@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field, return field_name; } -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +static enum hist_field_fn select_value_fn(int field_size, int field_is_signed) { - hist_field_fn_t fn = NULL; - switch (field_size) { case 8: if (field_is_signed) - fn = hist_field_s64; + return HIST_FIELD_FN_S64; else - fn = hist_field_u64; - break; + return HIST_FIELD_FN_U64; case 4: if (field_is_signed) - fn = hist_field_s32; + return HIST_FIELD_FN_S32; else - fn = hist_field_u32; - break; + return HIST_FIELD_FN_U32; case 2: if (field_is_signed) - fn = hist_field_s16; + return HIST_FIELD_FN_S16; else - fn = hist_field_u16; - break; + return HIST_FIELD_FN_U16; case 1: if (field_is_signed) - fn = hist_field_s8; + return HIST_FIELD_FN_S8; else - fn = hist_field_u8; - break; + return HIST_FIELD_FN_U8; } - return fn; + return HIST_FIELD_FN_NOP; } static int parse_map_size(char *str) @@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; /* caller will populate */ if (flags & HIST_FIELD_FL_VAR_REF) { - hist_field->fn = hist_field_var_ref; + hist_field->fn_num = HIST_FIELD_FN_VAR_REF; goto out; } if (flags & HIST_FIELD_FL_HITCOUNT) { - hist_field->fn = hist_field_counter; + hist_field->fn_num = HIST_FIELD_FN_COUNTER; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CONST) { - hist_field->fn = hist_field_const; + hist_field->fn_num = HIST_FIELD_FN_CONST; hist_field->size = sizeof(u64); hist_field->type = kstrdup("u64", GFP_KERNEL); if (!hist_field->type) @@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn = hist_field_none; + hist_field->fn_num = HIST_FIELD_FN_NOP; goto out; } if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); - hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : - hist_field_bucket; + hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 : + HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); @@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_TIMESTAMP) { - hist_field->fn = hist_field_timestamp; + hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { - hist_field->fn = hist_field_cpu; + hist_field->fn_num = HIST_FIELD_FN_CPU; hist_field->size = sizeof(int); hist_field->type = "unsigned int"; goto out; @@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto free; if (field->filter_type == FILTER_STATIC_STRING) { - hist_field->fn = hist_field_string; + hist_field->fn_num = HIST_FIELD_FN_STRING; hist_field->size = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - hist_field->fn = hist_field_dynstring; + hist_field->fn_num = HIST_FIELD_FN_DYNSTRING; } else if (field->filter_type == FILTER_RDYN_STRING) - hist_field->fn = hist_field_reldynstring; + hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else - hist_field->fn = hist_field_pstring; + hist_field->fn_num = HIST_FIELD_FN_PSTRING; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; @@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - hist_field->fn = select_value_fn(field->size, - field->is_signed); - if (!hist_field->fn) { + hist_field->fn_num = select_value_fn(field->size, + field->is_signed); + if (hist_field->fn_num == HIST_FIELD_FN_NOP) { destroy_hist_field(hist_field, 0); return NULL; } @@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, if (!alias) return NULL; - alias->fn = var_ref->fn; + alias->fn_num = var_ref->fn_num; alias->operands[0] = var_ref; if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { @@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); - expr->fn = hist_field_unary_minus; + expr->fn_num = HIST_FIELD_FN_UMINUS; expr->operands[0] = operand1; expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - hist_field_fn_t op_fn; + enum hist_field_fn op_fn; bool combine_consts; if (*n_subexprs > 3) { @@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, switch (field_op) { case FIELD_OP_MINUS: - op_fn = hist_field_minus; + op_fn = HIST_FIELD_FN_MINUS; break; case FIELD_OP_PLUS: - op_fn = hist_field_plus; + op_fn = HIST_FIELD_FN_PLUS; break; case FIELD_OP_DIV: - op_fn = hist_field_div; + op_fn = HIST_FIELD_FN_DIV; break; case FIELD_OP_MULT: - op_fn = hist_field_mult; + op_fn = HIST_FIELD_FN_MULT; break; default: ret = -EINVAL; @@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, op_fn = hist_field_get_div_fn(operand2); } + expr->fn_num = op_fn; + if (combine_consts) { if (var1) expr->operands[0] = var1; if (var2) expr->operands[1] = var2; - expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); + expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL); + expr->fn_num = HIST_FIELD_FN_CONST; expr->operands[0] = NULL; expr->operands[1] = NULL; @@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->name = expr_str(expr, 0); } else { - expr->fn = op_fn; - /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, buffer, rbe, rec); + var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + switch (hist_field->fn_num) { + case HIST_FIELD_FN_VAR_REF: + return hist_field_var_ref(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COUNTER: + return hist_field_counter(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CONST: + return hist_field_const(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_LOG2: + return hist_field_log2(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_BUCKET: + return hist_field_bucket(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_TIMESTAMP: + return hist_field_timestamp(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CPU: + return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STRING: + return hist_field_string(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DYNSTRING: + return hist_field_dynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_RELDYNSTRING: + return hist_field_reldynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PSTRING: + return hist_field_pstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S64: + return hist_field_s64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U64: + return hist_field_u64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S32: + return hist_field_s32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U32: + return hist_field_u32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S16: + return hist_field_s16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U16: + return hist_field_u16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S8: + return hist_field_s8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U8: + return hist_field_u8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_UMINUS: + return hist_field_unary_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MINUS: + return hist_field_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PLUS: + return hist_field_plus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV: + return hist_field_div(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MULT: + return hist_field_mult(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_POWER2: + return div_by_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_NOT_POWER2: + return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_MULT_SHIFT: + return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_EXECNAME: + return hist_field_execname(hist_field, elt, buffer, rbe, event); + default: + return 0; + } +} + /* Convert a var that points to common_pid.execname to a string */ static void update_var_execname(struct hist_field *hist_field) { @@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field) kfree_const(hist_field->type); hist_field->type = "char[]"; - hist_field->fn = hist_field_execname; + hist_field->fn_num = HIST_FIELD_FN_EXECNAME; } static int create_var_field(struct hist_trigger_data *hist_data, @@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } @@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 5e8c07aef071..29fbfb27c2b2 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -17,6 +17,8 @@ /* for gfp flag names */ #include <linux/trace_events.h> #include <trace/events/mmflags.h> +#include "trace_probe.h" +#include "trace_probe_kernel.h" #include "trace_synth.h" @@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, { unsigned int len = 0; char *str_field; + int ret; if (is_dynamic) { u32 data_offset; @@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry, data_offset += event->n_u64 * sizeof(u64); data_offset += data_size; - str_field = (char *)entry + data_offset; - - len = strlen(str_val) + 1; - strscpy(str_field, str_val, len); + len = kern_fetch_store_strlen((unsigned long)str_val); data_offset |= len << 16; *(u32 *)&entry->fields[*n_u64] = data_offset; + ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + (*n_u64)++; } else { str_field = (char *)&entry->fields[*n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)str_val < TASK_SIZE) + ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + else +#endif + ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); + + if (ret < 0) + strcpy(str_field, FAULT_STRING); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); } @@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = strlen(str_val) + 1; + len = kern_fetch_store_strlen((unsigned long)str_val); fields_size += len; } @@ -817,10 +828,9 @@ static int register_synth_event(struct synth_event *event) } ret = set_synth_event_print_fmt(call); - if (ret < 0) { + /* unregister_trace_event() will be called inside */ + if (ret < 0) trace_remove_event_call(call); - goto err; - } out: return ret; err: diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cb866c3141af..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index a6621c52ce45..ae78c2d53c8a 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -14,6 +14,7 @@ #include <linux/uio.h> #include <linux/ioctl.h> #include <linux/jhash.h> +#include <linux/refcount.h> #include <linux/trace_events.h> #include <linux/tracefs.h> #include <linux/types.h> @@ -39,28 +40,69 @@ */ #define MAX_PAGE_ORDER 0 #define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) +#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) +#define MAX_EVENTS (MAX_BYTES * 8) /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -#define MAX_FIELD_ARG_NAME 256 -static char *register_page_data; +/* + * The MAP_STATUS_* macros are used for taking a index and determining the + * appropriate byte and the bit in the byte to set/reset for an event. + * + * The lower 3 bits of the index decide which bit to set. + * The remaining upper bits of the index decide which byte to use for the bit. + * + * This is used when an event has a probe attached/removed to reflect live + * status of the event wanting tracing or not to user-programs via shared + * memory maps. + */ +#define MAP_STATUS_BYTE(index) ((index) >> 3) +#define MAP_STATUS_MASK(index) BIT((index) & 7) + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. These + * bits are compared against an internal byte on the event to determine which + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. + */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + +/* + * Stores the pages, tables, and locks for a group of events. + * Each logical grouping of events has its own group, with a + * matching page for status checks within user programs. This + * allows for isolation of events to user programs by various + * means. + */ +struct user_event_group { + struct page *pages; + char *register_page_data; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; + DECLARE_HASHTABLE(register_table, 8); + DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +}; -static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 4); -static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +/* Group for init_user_ns mapping, top-most group */ +static struct user_event_group *init_group; /* * Stores per-event properties, as users register events * within a file a user_event might be created if it does not * already exist. These are globally used and their lifetime * is tied to the refcnt member. These cannot go away until the - * refcnt reaches zero. + * refcnt reaches one. */ struct user_event { + struct user_event_group *group; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -68,10 +110,11 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; - atomic_t refcnt; + refcount_t refcnt; int index; int flags; int min_size; + char status; }; /* @@ -86,6 +129,11 @@ struct user_event_refs { struct user_event *events[]; }; +struct user_event_file_info { + struct user_event_group *group; + struct user_event_refs *refs; +}; + #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) @@ -98,7 +146,8 @@ struct user_event_validator { typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser); static u32 user_event_key(char *name) @@ -106,6 +155,144 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static void set_page_reservations(char *pages, bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = pages + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static void user_event_group_destroy(struct user_event_group *group) +{ + if (group->register_page_data) + set_page_reservations(group->register_page_data, false); + + if (group->pages) + __free_pages(group->pages, MAX_PAGE_ORDER); + + kfree(group->system_name); + kfree(group); +} + +static char *user_event_group_system_name(struct user_namespace *user_ns) +{ + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; + + if (user_ns != &init_user_ns) { + /* + * Unexpected at this point: + * We only currently support init_user_ns. + * When we enable more, this will trigger a failure so log. + */ + pr_warn("user_events: Namespace other than init_user_ns!\n"); + return NULL; + } + + system_name = kmalloc(len, GFP_KERNEL); + + if (!system_name) + return NULL; + + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + + return system_name; +} + +static inline struct user_event_group +*user_event_group_from_user_ns(struct user_namespace *user_ns) +{ + if (user_ns == &init_user_ns) + return init_group; + + return NULL; +} + +static struct user_event_group *current_user_event_group(void) +{ + struct user_namespace *user_ns = current_user_ns(); + struct user_event_group *group = NULL; + + while (user_ns) { + group = user_event_group_from_user_ns(user_ns); + + if (group) + break; + + user_ns = user_ns->parent; + } + + return group; +} + +static struct user_event_group +*user_event_group_create(struct user_namespace *user_ns) +{ + struct user_event_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + + if (!group) + return NULL; + + group->system_name = user_event_group_system_name(user_ns); + + if (!group->system_name) + goto error; + + group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + + if (!group->pages) + goto error; + + group->register_page_data = page_address(group->pages); + + set_page_reservations(group->register_page_data, true); + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(group->page_bitmap, MAX_EVENTS); + set_bit(0, group->page_bitmap); + + mutex_init(&group->reg_mutex); + hash_init(group->register_table); + + return group; +error: + if (group) + user_event_group_destroy(group); + + return NULL; +}; + +static __always_inline +void user_event_register_set(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); +} + +static __always_inline +void user_event_register_clear(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); +} + +static __always_inline __must_check +bool user_event_last_ref(struct user_event *user) +{ + return refcount_read(&user->refcnt) == 1; +} + static __always_inline __must_check size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) { @@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * * Upon success user_event has its ref count increased by 1. */ -static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +static int user_event_parse_cmd(struct user_event_group *group, + char *raw_command, struct user_event **newuser) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) if (flags) *flags++ = '\0'; - return user_event_parse(name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser); } static int user_field_array_size(const char *type) @@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type, goto add_field; add_validator: - if (strstr(type, "char") != 0) + if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; validator = kmalloc(sizeof(*validator), GFP_KERNEL); @@ -458,7 +646,7 @@ static const char *user_field_format(const char *type) return "%d"; if (strcmp(type, "unsigned char") == 0) return "%u"; - if (strstr(type, "char[") != 0) + if (strstr(type, "char[") != NULL) return "%s"; /* Unknown, likely struct, allowed treat as 64-bit */ @@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func) return false; check: - return strstr(type, "char") != 0; + return strstr(type, "char") != NULL; } #define LEN_OR_ZERO (len ? len - pos : 0) +static int user_dyn_field_set_string(int argc, const char **argv, int *iout, + char *buf, int len, bool *colon) +{ + int pos = 0, i = *iout; + + *colon = false; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + *colon = true; + break; + } + } + + /* Actual set, advance i */ + if (len != 0) + *iout = i; + + return pos + 1; +} + +static int user_field_set_string(struct ftrace_event_field *field, + char *buf, int len, bool colon) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + + if (colon) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + return pos + 1; +} + static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { struct ftrace_event_field *field, *next; @@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user) dyn_event_remove(&user->devent); - register_page_data[user->index] = 0; - clear_bit(user->index, page_bitmap); + user_event_register_clear(user); + clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user) return ret; } -static struct user_event *find_user_event(char *name, u32 *outkey) +static struct user_event *find_user_event(struct user_event_group *group, + char *name, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(register_table, user, node, key) + hash_for_each_possible(group->register_table, user, node, key) if (!strcmp(EVENT_NAME(user), name)) { - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); return user; } @@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - register_page_data[user->index] = status; + if (status) + user_event_register_set(user); + else + user_event_register_clear(user); + + user->status = status; } /* @@ -835,17 +1071,18 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); update_reg_page_for(user); return 0; dec: update_reg_page_for(user); - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); return 0; } static int user_event_create(const char *raw_command) { + struct user_event_group *group; struct user_event *user; char *name; int ret; @@ -861,14 +1098,19 @@ static int user_event_create(const char *raw_command) if (!name) return -ENOMEM; - mutex_lock(®_mutex); + group = current_user_event_group(); - ret = user_event_parse_cmd(name, &user); + if (!group) + return -ENOENT; + + mutex_lock(&group->reg_mutex); + + ret = user_event_parse_cmd(group, name, &user); if (!ret) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); if (ret) kfree(name); @@ -910,14 +1152,14 @@ static bool user_event_is_busy(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - return atomic_read(&user->refcnt) != 0; + return !user_event_last_ref(user); } static int user_event_free(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - if (atomic_read(&user->refcnt) != 0) + if (!user_event_last_ref(user)) return -EBUSY; return destroy_user_event(user); @@ -926,49 +1168,35 @@ static int user_event_free(struct dyn_event *ev) static bool user_field_match(struct ftrace_event_field *field, int argc, const char **argv, int *iout) { - char *field_name, *arg_name; - int len, pos, i = *iout; + char *field_name = NULL, *dyn_field_name = NULL; bool colon = false, match = false; + int dyn_len, len; - if (i >= argc) + if (*iout >= argc) return false; - len = MAX_FIELD_ARG_NAME; - field_name = kmalloc(len, GFP_KERNEL); - arg_name = kmalloc(len, GFP_KERNEL); - - if (!arg_name || !field_name) - goto out; - - pos = 0; - - for (; i < argc; ++i) { - if (i != *iout) - pos += snprintf(arg_name + pos, len - pos, " "); + dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + 0, &colon); - pos += snprintf(arg_name + pos, len - pos, argv[i]); + len = user_field_set_string(field, field_name, 0, colon); - if (strchr(argv[i], ';')) { - ++i; - colon = true; - break; - } - } + if (dyn_len != len) + return false; - pos = 0; + dyn_field_name = kmalloc(dyn_len, GFP_KERNEL); + field_name = kmalloc(len, GFP_KERNEL); - pos += snprintf(field_name + pos, len - pos, field->type); - pos += snprintf(field_name + pos, len - pos, " "); - pos += snprintf(field_name + pos, len - pos, field->name); + if (!dyn_field_name || !field_name) + goto out; - if (colon) - pos += snprintf(field_name + pos, len - pos, ";"); + user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + dyn_len, &colon); - *iout = i; + user_field_set_string(field, field_name, len, colon); - match = strcmp(arg_name, field_name) == 0; + match = strcmp(dyn_field_name, field_name) == 0; out: - kfree(arg_name); + kfree(dyn_field_name); kfree(field_name); return match; @@ -1036,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user) * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1. */ -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser) { int ret; @@ -1046,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags, /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(name, &key); + user = find_user_event(group, name, &key); mutex_unlock(&event_mutex); if (user) { @@ -1059,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags, return 0; } - index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); if (index == MAX_EVENTS) return -EMFILE; @@ -1073,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags, INIT_LIST_HEAD(&user->fields); INIT_LIST_HEAD(&user->validators); + user->group = group; user->tracepoint.name = name; ret = user_event_parse_fields(user, args); @@ -1091,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags, user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; + user->class.system = group->system_name; - user->class.system = USER_EVENTS_SYSTEM; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; user->class.reg = user_event_reg; @@ -1110,13 +1340,13 @@ static int user_event_parse(char *name, char *args, char *flags, user->index = index; - /* Ensure we track ref */ - atomic_inc(&user->refcnt); + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, page_bitmap); - hash_add(register_table, &user->node, key); + set_bit(user->index, group->page_bitmap); + hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1134,32 +1364,20 @@ put_user: /* * Deletes a previously created event if it is no longer being used. */ -static int delete_user_event(char *name) +static int delete_user_event(struct user_event_group *group, char *name) { u32 key; - int ret; - struct user_event *user = find_user_event(name, &key); + struct user_event *user = find_user_event(group, name, &key); if (!user) return -ENOENT; - /* Ensure we are the last ref */ - if (atomic_read(&user->refcnt) != 1) { - ret = -EBUSY; - goto put_ref; - } - - ret = destroy_user_event(user); - - if (ret) - goto put_ref; + refcount_dec(&user->refcnt); - return ret; -put_ref: - /* No longer have this ref */ - atomic_dec(&user->refcnt); + if (!user_event_last_ref(user)) + return -EBUSY; - return ret; + return destroy_user_event(user); } /* @@ -1167,6 +1385,7 @@ put_ref: */ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) { + struct user_event_file_info *info = file->private_data; struct user_event_refs *refs; struct user_event *user = NULL; struct tracepoint *tp; @@ -1178,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) rcu_read_lock_sched(); - refs = rcu_dereference_sched(file->private_data); + refs = rcu_dereference_sched(info->refs); /* * The refs->events array is protected by RCU, and new items may be @@ -1236,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) return ret; } +static int user_events_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + struct user_event_file_info *info; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return -ENOMEM; + + info->group = group; + + file->private_data = info; + + return 0; +} + static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -1245,7 +1486,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + if (unlikely(import_single_range(WRITE, (char __user *)ubuf, + count, &iov, &i))) return -EFAULT; return user_events_write_core(file, &i); @@ -1256,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) return user_events_write_core(kp->ki_filp, i); } -static int user_events_ref_add(struct file *file, struct user_event *user) +static int user_events_ref_add(struct user_event_file_info *info, + struct user_event *user) { + struct user_event_group *group = info->group; struct user_event_refs *refs, *new_refs; int i, size, count = 0; - refs = rcu_dereference_protected(file->private_data, - lockdep_is_held(®_mutex)); + refs = rcu_dereference_protected(info->refs, + lockdep_is_held(&group->reg_mutex)); if (refs) { count = refs->count; @@ -1286,9 +1530,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user) new_refs->events[i] = user; - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); - rcu_assign_pointer(file->private_data, new_refs); + rcu_assign_pointer(info->refs, new_refs); if (refs) kfree_rcu(refs, rcu); @@ -1309,13 +1553,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (size > PAGE_SIZE) return -E2BIG; - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + kreg->size = size; + + return 0; } /* * Registers a user_event on behalf of a user process. */ -static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +static long user_events_ioctl_reg(struct user_event_file_info *info, + unsigned long uarg) { struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; @@ -1336,24 +1591,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) return ret; } - ret = user_event_parse_cmd(name, &user); + ret = user_event_parse_cmd(info->group, name, &user); if (ret) { kfree(name); return ret; } - ret = user_events_ref_add(file, user); + ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); /* Positive number is index and valid */ if (ret < 0) return ret; put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_index); + put_user(user->index, &ureg->status_bit); return 0; } @@ -1361,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) /* * Deletes a user_event on behalf of a user process. */ -static long user_events_ioctl_del(struct file *file, unsigned long uarg) +static long user_events_ioctl_del(struct user_event_file_info *info, + unsigned long uarg) { void __user *ubuf = (void __user *)uarg; char *name; @@ -1374,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) /* event_mutex prevents dyn_event from racing */ mutex_lock(&event_mutex); - ret = delete_user_event(name); + ret = delete_user_event(info->group, name); mutex_unlock(&event_mutex); kfree(name); @@ -1388,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) static long user_events_ioctl(struct file *file, unsigned int cmd, unsigned long uarg) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group = info->group; long ret = -ENOTTY; switch (cmd) { case DIAG_IOCSREG: - mutex_lock(®_mutex); - ret = user_events_ioctl_reg(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_reg(info, uarg); + mutex_unlock(&group->reg_mutex); break; case DIAG_IOCSDEL: - mutex_lock(®_mutex); - ret = user_events_ioctl_del(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_del(info, uarg); + mutex_unlock(&group->reg_mutex); break; } @@ -1412,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, */ static int user_events_release(struct inode *node, struct file *file) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group; struct user_event_refs *refs; struct user_event *user; int i; + if (!info) + return -EINVAL; + + group = info->group; + /* * Ensure refs cannot change under any situation by taking the * register mutex during the final freeing of the references. */ - mutex_lock(®_mutex); + mutex_lock(&group->reg_mutex); - refs = file->private_data; + refs = info->refs; if (!refs) goto out; @@ -1436,37 +1701,56 @@ static int user_events_release(struct inode *node, struct file *file) user = refs->events[i]; if (user) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); } out: file->private_data = NULL; - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); kfree(refs); + kfree(info); return 0; } static const struct file_operations user_data_fops = { + .open = user_events_open, .write = user_events_write, .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, .release = user_events_release, }; +static struct user_event_group *user_status_group(struct file *file) +{ + struct seq_file *m = file->private_data; + + if (!m) + return NULL; + + return m->private; +} + /* * Maps the shared page into the user process for checking if event is enabled. */ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) { + char *pages; + struct user_event_group *group = user_status_group(file); unsigned long size = vma->vm_end - vma->vm_start; - if (size != MAX_EVENTS) + if (size != MAX_BYTES) + return -EINVAL; + + if (!group) return -EINVAL; + pages = group->register_page_data; + return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(register_page_data) >> PAGE_SHIFT, + virt_to_phys(pages) >> PAGE_SHIFT, size, vm_get_page_prot(VM_READ)); } @@ -1490,14 +1774,18 @@ static void user_seq_stop(struct seq_file *m, void *p) static int user_seq_show(struct seq_file *m, void *p) { + struct user_event_group *group = m->private; struct user_event *user; char status; int i, active = 0, busy = 0, flags; - mutex_lock(®_mutex); + if (!group) + return -EINVAL; + + mutex_lock(&group->reg_mutex); - hash_for_each(register_table, i, user, node) { - status = register_page_data[user->index]; + hash_for_each(group->register_table, i, user, node) { + status = user->status; flags = user->flags; seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); @@ -1520,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p) active++; } - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); @@ -1539,7 +1827,24 @@ static const struct seq_operations user_seq_ops = { static int user_status_open(struct inode *node, struct file *file) { - return seq_open(file, &user_seq_ops); + struct user_event_group *group; + int ret; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + ret = seq_open(file, &user_seq_ops); + + if (!ret) { + /* Chain group to seq_file */ + struct seq_file *m = file->private_data; + + m->private = group; + } + + return ret; } static const struct file_operations user_status_fops = { @@ -1580,42 +1885,21 @@ err: return -ENODEV; } -static void set_page_reservations(bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = register_page_data + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static int __init trace_events_user_init(void) { - struct page *pages; int ret; - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(page_bitmap, MAX_EVENTS); - set_bit(0, page_bitmap); + init_group = user_event_group_create(&init_user_ns); - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - if (!pages) + if (!init_group) return -ENOMEM; - register_page_data = page_address(pages); - - set_page_reservations(true); ret = create_user_tracefs(); if (ret) { pr_warn("user_events could not register with tracefs\n"); - set_page_reservations(false); - __free_pages(pages, MAX_PAGE_ORDER); + user_event_group_destroy(init_group); + init_group = NULL; return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 23f7f0ec4f4c..5a75b039e586 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -20,6 +20,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 313439920a8c..78d536d3ff3d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void) for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); if (retval) { + cpus_read_unlock(); stop_per_cpu_kthreads(); - break; + return retval; } } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 95b58bd757ce..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) } lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 850a88abd33b..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -307,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -323,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -384,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; @@ -617,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 3b3869ae8cfd..de38f1c03776 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(SAME_PROBE, "There is already the exact same probe event"),\ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ - C(BAD_ATTACH_ARG, "Attached event does not have this field"), + C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ + C(NO_EP_FILTER, "No filter rule after 'if'"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h new file mode 100644 index 000000000000..77dbd9ff9782 --- /dev/null +++ b/kernel/trace/trace_probe_kernel.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_PROBE_KERNEL_H_ +#define __TRACE_PROBE_KERNEL_H_ + +#define FAULT_STRING "(fault)" + +/* + * This depends on trace_probe.h, but can not include it due to + * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. + * Which means that any other user must include trace_probe.h before including + * this file. + */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + int ret; + + ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + /* + * strnlen_user_nofault returns zero on fault, insert the + * FAULT_STRING when that occurs. + */ + if (ret <= 0) + return strlen(FAULT_STRING) + 1; + return ret; +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return kern_fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + /* For faults, return enough to hold the FAULT_STRING */ + return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; +} + +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) +{ + if (ret >= 0) { + *(u32 *)dest = make_data_loc(ret, __dest - base); + } else { + strscpy(__dest, FAULT_STRING, len); + ret = strlen(__dest) + 1; + } +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +kern_fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return kern_fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +#endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b69e207012c9..942ddbdace4a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return trace_handle_return(s); } -extern char *__bad_type_size(void); - #define SYSCALL_FIELD(_type, _name) { \ .type = #_type, .name = #_name, \ .size = sizeof(_type), .align = __alignof__(_type), \ diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9901708ce6b8..c774e560f2f9 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { - unsigned int dups = 0, total_dups = 0; + unsigned int total_dups = 0; int i; void *key; @@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries, key = sort_entries[0]->key; for (i = 1; i < n_entries; i++) { if (!memcmp(sort_entries[i]->key, key, key_size)) { - dups++; total_dups++; + total_dups++; continue; } key = sort_entries[i]->key; - dups = 0; } WARN_ONCE(total_dups > 0, |