From a10787e6d58c24b51e91c19c6d16c5da89fcaa4b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:14 -0800 Subject: bpf: Enable task local storage for tracing programs To access per-task data, BPF programs usually creates a hash table with pid as the key. This is not ideal because: 1. The user need to estimate the proper size of the hash table, which may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user need to write extra logic. Task local storage overcomes these issues and offers a better option for these per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Unlike LSM programs, tracing programs can be called in IRQ contexts. Helpers that access task local storage are updated to use raw_spin_lock_irqsave() instead of raw_spin_lock_bh(). Tracing programs can attach to functions on the task free path, e.g. exit_creds(). To avoid allocating task local storage after bpf_task_storage_free(). bpf_task_storage_get() is updated to not allocate new storage when the task is not refcounted (task->usage == 0). Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com --- kernel/trace/bpf_trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace/bpf_trace.c') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..e9701744d8e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1367,6 +1367,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return NULL; } -- cgit v1.2.3 From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:25 -0800 Subject: bpf: Add bpf_for_each_map_elem() helper The bpf_for_each_map_elem() helper is introduced which iterates all map elements with a callback function. The helper signature looks like long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags) and for each map element, the callback_fn will be called. For example, like hashmap, the callback signature may look like long callback_fn(map, key, val, callback_ctx) There are two known use cases for this. One is from upstream ([1]) where a for_each_map_elem helper may help implement a timeout mechanism in a more generic way. Another is from our internal discussion for a firewall use case where a map contains all the rules. The packet data can be compared to all these rules to decide allow or deny the packet. For array maps, users can already use a bounded loop to traverse elements. Using this helper can avoid using bounded loop. For other type of maps (e.g., hash maps) where bounded loop is hard or impossible to use, this helper provides a convenient way to operate on all elements. For callback_fn, besides map and map element, a callback_ctx, allocated on caller stack, is also passed to the callback function. This callback_ctx argument can provide additional input and allow to write to caller stack for output. If the callback_fn returns 0, the helper will iterate through next element if available. If the callback_fn returns 1, the helper will stop iterating and returns to the bpf program. Other return values are not used for now. Currently, this helper is only available with jit. It is possible to make it work with interpreter with so effort but I leave it as the future work. [1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com --- include/linux/bpf.h | 13 +++ include/linux/bpf_verifier.h | 3 + include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/bpf_iter.c | 16 ++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 208 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 + tools/include/uapi/linux/bpf.h | 38 ++++++++ 8 files changed, 307 insertions(+), 13 deletions(-) (limited to 'kernel/trace/bpf_trace.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1e4d2f60527..aeb1b93a4d75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -129,6 +130,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -295,6 +303,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +421,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. @@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbdca49ac6cc..53afe9461b03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -552,6 +561,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3396,7 +3430,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? BPF_WRITE : @@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4729,6 +4785,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); } +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem) return 0; if (map == NULL) { @@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. + */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9701744d8e4..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d9c9e4db186ab4d81f84e6f22b225d333b9424e3 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:38 +0200 Subject: bpf: Factorize bpf_trace_printk and bpf_seq_printf Two helpers (trace_printk and seq_printf) have very similar implementations of format string parsing and a third one is coming (snprintf). To avoid code duplication and make the code easier to maintain, this moves the operations associated with format string parsing (validation and argument sanitization) into one generic function. The implementation of the two existing helpers already drifted quite a bit so unifying them entailed a lot of changes: - bpf_trace_printk always expected fmt[fmt_size] to be the terminating NULL character, this is no longer true, the first 0 is terminating. - bpf_trace_printk now supports %% (which produces the percentage char). - bpf_trace_printk now skips width formating fields. - bpf_trace_printk now supports the X modifier (capital hexadecimal). - bpf_trace_printk now supports %pK, %px, %pB, %pi4, %pI4, %pi6 and %pI6 - argument casting on 32 bit has been simplified into one macro and using an enum instead of obscure int increments. - bpf_seq_printf now uses bpf_trace_copy_string instead of strncpy_from_kernel_nofault and handles the %pks %pus specifiers. - bpf_seq_printf now prints longs correctly on 32 bit architectures. - both were changed to use a global per-cpu tmp buffer instead of one stack buffer for trace_printk and 6 small buffers for seq_printf. - to avoid per-cpu buffer usage conflict, these helpers disable preemption while the per-cpu buffer is in use. - both helpers now support the %ps and %pS specifiers to print symbols. The implementation is also moved from bpf_trace.c to helpers.c because the upcoming bpf_snprintf helper will be made available to all BPF programs and will need it. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210419155243.1632274-2-revest@chromium.org --- include/linux/bpf.h | 20 +++ kernel/bpf/helpers.c | 256 ++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 371 +++++------------------------------------------ 3 files changed, 313 insertions(+), 334 deletions(-) (limited to 'kernel/trace/bpf_trace.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff8cd68c01b3..77d1d8c65b81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2077,4 +2077,24 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +enum bpf_printf_mod_type { + BPF_PRINTF_INT, + BPF_PRINTF_LONG, + BPF_PRINTF_LONG_LONG, +}; + +/* Workaround for getting va_list handling working with different argument type + * combinations generically for 32 and 64 bit archs. + */ +#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ + (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ + (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ + ? (u64)args[arg_nb] \ + : (u32)args[arg_nb]) + +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args); +void bpf_printf_cleanup(void); + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f306611c4ddf..9ca57eb1fc0d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -669,6 +669,262 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; +static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + fallthrough; +#endif + case 'k': + return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + case 'u': + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + } + + return -EINVAL; +} + +/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p + */ +#define MAX_PRINTF_BUF_LEN 512 + +struct bpf_printf_buf { + char tmp_buf[MAX_PRINTF_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); +static DEFINE_PER_CPU(int, bpf_printf_buf_used); + +static int try_get_fmt_tmp_buf(char **tmp_buf) +{ + struct bpf_printf_buf *bufs; + int used; + + if (*tmp_buf) + return 0; + + preempt_disable(); + used = this_cpu_inc_return(bpf_printf_buf_used); + if (WARN_ON_ONCE(used > 1)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + return -EBUSY; + } + bufs = this_cpu_ptr(&bpf_printf_buf); + *tmp_buf = bufs->tmp_buf; + + return 0; +} + +void bpf_printf_cleanup(void) +{ + if (this_cpu_read(bpf_printf_buf_used)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + } +} + +/* + * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + * + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: + * - Format string verification only: when final_args and mod are NULL + * - Arguments preparation: in addition to the above verification, it writes in + * final_args a copy of raw_args where pointers from BPF have been sanitized + * into pointers safe to use by snprintf. This also writes in the mod array + * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_printf_cleanup should be called to free them after use. + */ +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args) +{ + char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; + size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; + int err, i, num_spec = 0, copy_size; + enum bpf_printf_mod_type cur_mod; + u64 cur_arg; + char fmt_ptype; + + if (!!final_args != !!mod) + return -EINVAL; + + fmt_end = strnchr(fmt, fmt_size, 0); + if (!fmt_end) + return -EINVAL; + fmt_size = fmt_end - fmt; + + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto cleanup; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (num_spec >= num_args) { + err = -EINVAL; + goto cleanup; + } + + /* The string is zero-terminated so if fmt[i] != 0, we can + * always access fmt[i + 1], in the worst case it will be a 0 + */ + i++; + + /* skip optional "[0 +-][num]" width formatting field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 'p') { + cur_mod = BPF_PRINTF_LONG; + + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || + fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + /* just kernel pointers */ + if (final_args) + cur_arg = raw_args[num_spec]; + goto fmt_next; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || + (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { + err = -EINVAL; + goto cleanup; + } + + i += 2; + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + if (tmp_buf_len < copy_size) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, + copy_size); + if (err < 0) + memset(tmp_buf, 0, copy_size); + cur_arg = (u64)(long)tmp_buf; + tmp_buf += copy_size; + tmp_buf_len -= copy_size; + + goto fmt_next; + } else if (fmt[i] == 's') { + cur_mod = BPF_PRINTF_LONG; + fmt_ptype = fmt[i]; +fmt_str: + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) { + err = -EINVAL; + goto cleanup; + } + + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + if (!tmp_buf_len) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, + fmt_ptype, tmp_buf_len); + if (err < 0) { + tmp_buf[0] = '\0'; + err = 1; + } + + cur_arg = (u64)(long)tmp_buf; + tmp_buf += err; + tmp_buf_len -= err; + + goto fmt_next; + } + + cur_mod = BPF_PRINTF_INT; + + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG; + i++; + } + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG_LONG; + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && + fmt[i] != 'x' && fmt[i] != 'X') { + err = -EINVAL; + goto cleanup; + } + + if (final_args) + cur_arg = raw_args[num_spec]; +fmt_next: + if (final_args) { + mod[num_spec] = cur_mod; + final_args[num_spec] = cur_arg; + } + num_spec++; + } + + err = 0; +cleanup: + if (err) + bpf_printf_cleanup(); +out: + return err; +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0d23755c2747..a13f8644b357 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -372,188 +372,38 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, - size_t bufsz) -{ - void __user *user_ptr = (__force void __user *)unsafe_ptr; - - buf[0] = 0; - - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)unsafe_ptr < TASK_SIZE) { - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } - fallthrough; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); - break; - case 'u': - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } -} - static DEFINE_RAW_SPINLOCK(trace_printk_lock); -#define BPF_TRACE_PRINTK_SIZE 1024 +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 -static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; - va_list ap; int ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - va_start(ap, fmt); - ret = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - /* vsnprintf() will not append null for zero-length strings */ + ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, + MAX_TRACE_PRINTK_VARARGS); + if (ret < 0) + return ret; + + ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); + /* snprintf() will not append null for zero-length strings */ if (ret == 0) buf[0] = '\0'; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - return ret; -} - -/* - * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s - */ -BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, - u64, arg2, u64, arg3) -{ - int i, mod[3] = {}, fmt_cnt = 0; - char buf[64], fmt_ptype; - void *unsafe_ptr = NULL; - bool str_seen = false; + bpf_printf_cleanup(); - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - return -EINVAL; - - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) - return -EINVAL; - - if (fmt[i] != '%') - continue; - - if (fmt_cnt >= 3) - return -EINVAL; - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } else if (fmt[i] == 'p') { - mod[fmt_cnt]++; - if ((fmt[i + 1] == 'k' || - fmt[i + 1] == 'u') && - fmt[i + 2] == 's') { - fmt_ptype = fmt[i + 1]; - i += 2; - goto fmt_str; - } - - if (fmt[i + 1] == 'B') { - i++; - goto fmt_next; - } - - /* disallow any further format extensions */ - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - goto fmt_next; - } else if (fmt[i] == 's') { - mod[fmt_cnt]++; - fmt_ptype = fmt[i]; -fmt_str: - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - switch (fmt_cnt) { - case 0: - unsafe_ptr = (void *)(long)arg1; - arg1 = (long)buf; - break; - case 1: - unsafe_ptr = (void *)(long)arg2; - arg2 = (long)buf; - break; - case 2: - unsafe_ptr = (void *)(long)arg3; - arg3 = (long)buf; - break; - } - - bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, - sizeof(buf)); - goto fmt_next; - } - - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') - return -EINVAL; -fmt_next: - fmt_cnt++; - } - -/* Horrid workaround for getting va_list handling working with different - * argument type combinations generically for 32 and 64 bit archs. - */ -#define __BPF_TP_EMIT() __BPF_ARG3_TP() -#define __BPF_TP(...) \ - bpf_do_trace_printk(fmt, ##__VA_ARGS__) - -#define __BPF_ARG1_TP(...) \ - ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_TP(arg1, ##__VA_ARGS__) \ - : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ - : __BPF_TP((u32)arg1, ##__VA_ARGS__))) - -#define __BPF_ARG2_TP(...) \ - ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ - : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ - : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) - -#define __BPF_ARG3_TP(...) \ - ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ - : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ - : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) - - return __BPF_TP_EMIT(); + return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -581,184 +431,37 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) } #define MAX_SEQ_PRINTF_VARARGS 12 -#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 -#define MAX_SEQ_PRINTF_STR_LEN 128 - -struct bpf_seq_printf_buf { - char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; -}; -static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); -static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; - int i, buf_used, copy_size, num_args; - u64 params[MAX_SEQ_PRINTF_VARARGS]; - struct bpf_seq_printf_buf *bufs; - const u64 *args = data; - - buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); - if (WARN_ON_ONCE(buf_used > 1)) { - err = -EBUSY; - goto out; - } - - bufs = this_cpu_ptr(&bpf_seq_printf_buf); - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - goto out; - - if (data_len & 7) - goto out; - - for (i = 0; i < fmt_size; i++) { - if (fmt[i] == '%') { - if (fmt[i + 1] == '%') - i++; - else if (!data || !data_len) - goto out; - } - } + enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; + u64 args[MAX_SEQ_PRINTF_VARARGS]; + int err, num_args; + if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; num_args = data_len / 8; - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - /* only printable ascii for now. */ - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { - err = -EINVAL; - goto out; - } - - if (fmt[i] != '%') - continue; - - if (fmt[i + 1] == '%') { - i++; - continue; - } - - if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { - err = -E2BIG; - goto out; - } - - if (fmt_cnt >= num_args) { - err = -EINVAL; - goto out; - } - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - - /* skip optional "[0 +-][num]" width formating field */ - while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || - fmt[i] == ' ') - i++; - if (fmt[i] >= '1' && fmt[i] <= '9') { - i++; - while (fmt[i] >= '0' && fmt[i] <= '9') - i++; - } - - if (fmt[i] == 's') { - void *unsafe_ptr; - - /* try our best to copy */ - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - unsafe_ptr = (void *)(long)args[fmt_cnt]; - err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], - unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); - if (err < 0) - bufs->buf[memcpy_cnt][0] = '\0'; - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'p') { - if (fmt[i + 1] == 0 || - fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || - fmt[i + 1] == 'B') { - /* just kernel pointers */ - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - continue; - } - - /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ - if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { - err = -EINVAL; - goto out; - } - if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { - err = -EINVAL; - goto out; - } - - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - - err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - copy_size); - if (err < 0) - memset(bufs->buf[memcpy_cnt], 0, copy_size); - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - i += 2; - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'l') { - i++; - if (fmt[i] == 'l') - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x' && - fmt[i] != 'X') { - err = -EINVAL; - goto out; - } - - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - } + err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + if (err < 0) + return err; /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give * all of them to seq_printf(). */ - seq_printf(m, fmt, params[0], params[1], params[2], params[3], - params[4], params[5], params[6], params[7], params[8], - params[9], params[10], params[11]); + seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); - err = seq_has_overflowed(m) ? -EOVERFLOW : 0; -out: - this_cpu_dec(bpf_seq_printf_buf_used); - return err; + bpf_printf_cleanup(); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) -- cgit v1.2.3 From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:40 +0200 Subject: bpf: Add a bpf_snprintf helper The implementation takes inspiration from the existing bpf_trace_printk helper but there are a few differences: To allow for a large number of format-specifiers, parameters are provided in an array, like in bpf_seq_printf. Because the output string takes two arguments and the array of parameters also takes two arguments, the format string needs to fit in one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to a zero-terminated read-only map so we don't need a format string length arg. Because the format-string is known at verification time, we also do a first pass of format string validation in the verifier logic. This makes debugging easier. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ kernel/bpf/helpers.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 41 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ 6 files changed, 150 insertions(+) (limited to 'kernel/trace/bpf_trace.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c160526fc8bf..f8a45f109e96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca57eb1fc0d..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -925,6 +925,54 @@ out: return err; } +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). + */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f46dd6f3383..994ef36c5f60 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) + return err; + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a13f8644b357..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 38d26d89b31d0766d431471572cc9b007ca19c98 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 27 Apr 2021 13:29:58 +0200 Subject: bpf: Lock bpf_trace_printk's tmp buf before it is written to bpf_trace_printk uses a shared static buffer to hold strings before they are printed. A recent refactoring moved the locking of that buffer after it gets filled by mistake. Fixes: d9c9e4db186a ("bpf: Factorize bpf_trace_printk and bpf_seq_printf") Reported-by: Rasmus Villemoes Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210427112958.773132-1-revest@chromium.org --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace/bpf_trace.c') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a8bcdc927c7..0e67d12a8f40 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -391,13 +391,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (ret < 0) return ret; + raw_spin_lock_irqsave(&trace_printk_lock, flags); ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); /* snprintf() will not append null for zero-length strings */ if (ret == 0) buf[0] = '\0'; - raw_spin_lock_irqsave(&trace_printk_lock, flags); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); -- cgit v1.2.3 From 48cac3f4a96ddf08df8e53809ed066de0dc93915 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 27 Apr 2021 19:43:13 +0200 Subject: bpf: Implement formatted output helpers with bstr_printf BPF has three formatted output helpers: bpf_trace_printk, bpf_seq_printf and bpf_snprintf. Their signatures specify that all arguments are provided from the BPF world as u64s (in an array or as registers). All of these helpers are currently implemented by calling functions such as snprintf() whose signatures take a variable number of arguments, then placed in a va_list by the compiler to call vsnprintf(). "d9c9e4db bpf: Factorize bpf_trace_printk and bpf_seq_printf" introduced a bpf_printf_prepare function that fills an array of u64 sanitized arguments with an array of "modifiers" which indicate what the "real" size of each argument should be (given by the format specifier). The BPF_CAST_FMT_ARG macro consumes these arrays and casts each argument to its real size. However, the C promotion rules implicitely cast them all back to u64s. Therefore, the arguments given to snprintf are u64s and the va_list constructed by the compiler will use 64 bits for each argument. On 64 bit machines, this happens to work well because 32 bit arguments in va_lists need to occupy 64 bits anyway, but on 32 bit architectures this breaks the layout of the va_list expected by the called function and mangles values. In "88a5c690b6 bpf: fix bpf_trace_printk on 32 bit archs", this problem had been solved for bpf_trace_printk only with a "horrid workaround" that emitted multiple calls to trace_printk where each call had different argument types and generated different va_list layouts. One of the call would be dynamically chosen at runtime. This was ok with the 3 arguments that bpf_trace_printk takes but bpf_seq_printf and bpf_snprintf accept up to 12 arguments. Because this approach scales code exponentially, it is not a viable option anymore. Because the promotion rules are part of the language and because the construction of a va_list is an arch-specific ABI, it's best to just avoid variadic arguments and va_lists altogether. Thankfully the kernel's snprintf() has an alternative in the form of bstr_printf() that accepts arguments in a "binary buffer representation". These binary buffers are currently created by vbin_printf and used in the tracing subsystem to split the cost of printing into two parts: a fast one that only dereferences and remembers values, and a slower one, called later, that does the pretty-printing. This patch refactors bpf_printf_prepare to construct binary buffers of arguments consumable by bstr_printf() instead of arrays of arguments and modifiers. This gets rid of BPF_CAST_FMT_ARG and greatly simplifies the bpf_printf_prepare usage but there are a few gotchas that change how bpf_printf_prepare needs to do things. Currently, bpf_printf_prepare uses a per cpu temporary buffer as a generic storage for strings and IP addresses. With this refactoring, the temporary buffers now holds all the arguments in a structured binary format. To comply with the format expected by bstr_printf, certain format specifiers also need to be pre-formatted: %pB and %pi6/%pi4/%pI4/%pI6. Because vsnprintf subroutines for these specifiers are hard to expose, we pre-format these arguments with calls to snprintf(). Reported-by: Rasmus Villemoes Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210427174313.860948-3-revest@chromium.org --- include/linux/bpf.h | 22 +----- init/Kconfig | 1 + kernel/bpf/helpers.c | 188 +++++++++++++++++++++++++---------------------- kernel/bpf/verifier.c | 2 +- kernel/trace/bpf_trace.c | 34 +++------ 5 files changed, 115 insertions(+), 132 deletions(-) (limited to 'kernel/trace/bpf_trace.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ad4bcf1cadbb..b33f199c4cc2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2081,24 +2081,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -enum bpf_printf_mod_type { - BPF_PRINTF_INT, - BPF_PRINTF_LONG, - BPF_PRINTF_LONG_LONG, -}; - -/* Workaround for getting va_list handling working with different argument type - * combinations generically for 32 and 64 bit archs. - */ -#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ - (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ - (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ - ? (u64)args[arg_nb] \ - : (u32)args[arg_nb]) - -int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u64 *final_args, enum bpf_printf_mod_type *mod, - u32 num_args); -void bpf_printf_cleanup(void); +int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_buf, u32 num_args); +void bpf_bprintf_cleanup(void); #endif /* _LINUX_BPF_H */ diff --git a/init/Kconfig b/init/Kconfig index 5deae45b8d81..0d82a1f838cc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1708,6 +1708,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select BINARY_PRINTF select NET_SOCK_MSG if INET default n help diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 85b26ca5aacd..544773970dbc 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -707,9 +707,6 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) struct bpf_printf_buf *bufs; int used; - if (*tmp_buf) - return 0; - preempt_disable(); used = this_cpu_inc_return(bpf_printf_buf_used); if (WARN_ON_ONCE(used > 1)) { @@ -723,7 +720,7 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) return 0; } -void bpf_printf_cleanup(void) +void bpf_bprintf_cleanup(void) { if (this_cpu_read(bpf_printf_buf_used)) { this_cpu_dec(bpf_printf_buf_used); @@ -732,43 +729,45 @@ void bpf_printf_cleanup(void) } /* - * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: - * - Format string verification only: when final_args and mod are NULL + * - Format string verification only: when bin_args is NULL * - Arguments preparation: in addition to the above verification, it writes in - * final_args a copy of raw_args where pointers from BPF have been sanitized - * into pointers safe to use by snprintf. This also writes in the mod array - * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * bin_args a binary representation of arguments usable by bstr_printf where + * pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are - * allocated and bpf_printf_cleanup should be called to free them after use. + * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u64 *final_args, enum bpf_printf_mod_type *mod, - u32 num_args) +int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_args, u32 num_args) { - char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; - size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; - int err, i, num_spec = 0, copy_size; - enum bpf_printf_mod_type cur_mod; + char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; + size_t sizeof_cur_arg, sizeof_cur_ip; + int err, i, num_spec = 0; u64 cur_arg; - char fmt_ptype; - - if (!!final_args != !!mod) - return -EINVAL; + char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; + if (bin_args) { + if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) + return -EBUSY; + + tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + *bin_args = (u32 *)tmp_buf; + } + for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; - goto cleanup; + goto out; } if (fmt[i] != '%') @@ -781,7 +780,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (num_spec >= num_args) { err = -EINVAL; - goto cleanup; + goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can @@ -800,7 +799,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, } if (fmt[i] == 'p') { - cur_mod = BPF_PRINTF_LONG; + sizeof_cur_arg = sizeof(long); if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { @@ -811,117 +810,140 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || - fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + fmt[i + 1] == 'x' || fmt[i + 1] == 's' || + fmt[i + 1] == 'S') { /* just kernel pointers */ - if (final_args) + if (tmp_buf) cur_arg = raw_args[num_spec]; - goto fmt_next; + i++; + goto nocopy_fmt; + } + + if (fmt[i + 1] == 'B') { + if (tmp_buf) { + err = snprintf(tmp_buf, + (tmp_buf_end - tmp_buf), + "%pB", + (void *)(long)raw_args[num_spec]); + tmp_buf += (err + 1); + } + + i++; + num_spec++; + continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; - goto cleanup; + goto out; } i += 2; - if (!final_args) - goto fmt_next; + if (!tmp_buf) + goto nocopy_fmt; - if (try_get_fmt_tmp_buf(&tmp_buf)) { - err = -EBUSY; - goto out; - } - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - if (tmp_buf_len < copy_size) { + sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; + if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; - goto cleanup; + goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; - err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, - copy_size); + err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, + sizeof_cur_ip); if (err < 0) - memset(tmp_buf, 0, copy_size); - cur_arg = (u64)(long)tmp_buf; - tmp_buf += copy_size; - tmp_buf_len -= copy_size; + memset(cur_ip, 0, sizeof_cur_ip); + + /* hack: bstr_printf expects IP addresses to be + * pre-formatted as strings, ironically, the easiest way + * to do that is to call snprintf. + */ + ip_spec[2] = fmt[i - 1]; + ip_spec[3] = fmt[i]; + err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, + ip_spec, &cur_ip); - goto fmt_next; + tmp_buf += err + 1; + num_spec++; + + continue; } else if (fmt[i] == 's') { - cur_mod = BPF_PRINTF_LONG; fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; - goto cleanup; - } - - if (!final_args) - goto fmt_next; - - if (try_get_fmt_tmp_buf(&tmp_buf)) { - err = -EBUSY; goto out; } - if (!tmp_buf_len) { + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { err = -ENOSPC; - goto cleanup; + goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, - fmt_ptype, tmp_buf_len); + fmt_ptype, + tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } - cur_arg = (u64)(long)tmp_buf; tmp_buf += err; - tmp_buf_len -= err; + num_spec++; - goto fmt_next; + continue; } - cur_mod = BPF_PRINTF_INT; + sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { - cur_mod = BPF_PRINTF_LONG; + sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { - cur_mod = BPF_PRINTF_LONG_LONG; + sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; - goto cleanup; + goto out; } - if (final_args) + if (tmp_buf) cur_arg = raw_args[num_spec]; -fmt_next: - if (final_args) { - mod[num_spec] = cur_mod; - final_args[num_spec] = cur_arg; +nocopy_fmt: + if (tmp_buf) { + tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); + if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { + err = -ENOSPC; + goto out; + } + + if (sizeof_cur_arg == 8) { + *(u32 *)tmp_buf = *(u32 *)&cur_arg; + *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); + } else { + *(u32 *)tmp_buf = (u32)(long)cur_arg; + } + tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; -cleanup: - if (err) - bpf_printf_cleanup(); out: + if (err) + bpf_bprintf_cleanup(); return err; } @@ -930,9 +952,8 @@ out: BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, data, u32, data_len) { - enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; - u64 args[MAX_SNPRINTF_VARARGS]; int err, num_args; + u32 *bin_args; if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || (data_len && !data)) @@ -942,22 +963,13 @@ BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ - err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); if (err < 0) return err; - /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give - * all of them to snprintf(). - */ - err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), - BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), - BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), - BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), - BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), - BPF_CAST_FMT_ARG(11, args, mod)); - - bpf_printf_cleanup(); + err = bstr_printf(str, str_size, fmt, bin_args); + + bpf_bprintf_cleanup(); return err + 1; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9145f88b2a0a..8fd552c16763 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5946,7 +5946,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we * can focus on validating the format specifiers. */ - err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); if (err < 0) verbose(env, "Invalid format string\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e67d12a8f40..d2d7cf6cfe83 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -381,27 +381,23 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; - enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; + u32 *bin_args; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; int ret; - ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, - MAX_TRACE_PRINTK_VARARGS); + ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, + MAX_TRACE_PRINTK_VARARGS); if (ret < 0) return ret; raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); - /* snprintf() will not append null for zero-length strings */ - if (ret == 0) - buf[0] = '\0'; + ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - bpf_printf_cleanup(); + bpf_bprintf_cleanup(); return ret; } @@ -435,31 +431,21 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; - u64 args[MAX_SEQ_PRINTF_VARARGS]; int err, num_args; + u32 *bin_args; if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || (data_len && !data)) return -EINVAL; num_args = data_len / 8; - err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); if (err < 0) return err; - /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give - * all of them to seq_printf(). - */ - seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), - BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), - BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), - BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), - BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), - BPF_CAST_FMT_ARG(11, args, mod)); - - bpf_printf_cleanup(); + seq_bprintf(m, fmt, bin_args); + + bpf_bprintf_cleanup(); return seq_has_overflowed(m) ? -EOVERFLOW : 0; } -- cgit v1.2.3