Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile          |   5
-rw-r--r-- | kernel/bpf/arraymap.c        | 168
-rw-r--r-- | kernel/bpf/core.c            | 308
-rw-r--r-- | kernel/bpf/hashtab.c         | 458
-rw-r--r-- | kernel/bpf/helpers.c         |  17
-rw-r--r-- | kernel/bpf/inode.c           |  44
-rw-r--r-- | kernel/bpf/percpu_freelist.c | 100
-rw-r--r-- | kernel/bpf/percpu_freelist.h |  31
-rw-r--r-- | kernel/bpf/stackmap.c        | 290
-rw-r--r-- | kernel/bpf/syscall.c         | 122
-rw-r--r-- | kernel/bpf/verifier.c        | 831
11 files changed, 2066 insertions, 308 deletions
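Editorial note: the arraymap.c, hashtab.c and syscall.c changes below add BPF_MAP_TYPE_PERCPU_ARRAY and BPF_MAP_TYPE_PERCPU_HASH, whose syscall-side lookup copies round_up(value_size, 8) bytes for every possible CPU into the user buffer (see bpf_percpu_array_copy(), bpf_percpu_hash_copy() and the value_size handling in map_lookup_elem()). The following is a minimal user-space sketch of that contract, not part of the patch; the helper names, the u64-counter layout and the map_fd parameter are illustrative assumptions.

```c
/* Illustrative only: user-space lookup of one element of a
 * BPF_MAP_TYPE_PERCPU_ARRAY via the raw bpf(2) syscall.  The kernel
 * fills the buffer with round_up(value_size, 8) bytes per possible CPU,
 * so for a u64 counter map 'vals' must hold one 8-byte slot per
 * possible CPU (error handling and map creation are omitted).
 */
#include <linux/bpf.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* Hypothetical helper: read the per-CPU u64 counters stored under key 0. */
static int read_percpu_counters(int map_fd, uint64_t *vals)
{
	uint32_t key = 0;
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (uint64_t)(unsigned long)&key;
	attr.value  = (uint64_t)(unsigned long)vals;

	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}
```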
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,4 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,15 +17,43 @@ #include <linux/filter.h> #include <linux/perf_event.h> +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map 
*map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. 
During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 972d9a8e4ac4..f1e8a0def99b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/frame.h> #include <asm/unaligned.h> @@ -128,14 +129,83 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, return fp; } -EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { kfree(fp->aux); vfree(fp); } -EXPORT_SYMBOL_GPL(__bpf_prog_free); + +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. 
+ */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. + */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} #ifdef CONFIG_BPF_JIT struct bpf_binary_header * @@ -173,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_memfree(hdr); } + +int bpf_jit_harden __read_mostly; + +static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +{ + struct bpf_insn *to = to_buff; + u32 imm_rnd = prandom_u32(); + s16 off; + + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + + if (from->imm == 0 && + (from->code == (BPF_ALU | BPF_MOV | BPF_K) || + from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { + *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); + goto out; + } + + switch (from->code) { + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. 
*/ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IMM | BPF_DW: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); + break; + case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); + break; + + case BPF_ST | BPF_MEM | BPF_DW: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); + break; + } +out: + return to - to_buff; +} + +static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + + /* aux->prog still points to the fp_other one, so + * when promoting the clone to the real program, + * this still needs to be adapted. + */ + memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); + } + + return fp; +} + +static void bpf_prog_clone_free(struct bpf_prog *fp) +{ + /* aux was stolen by the other clone, so we cannot free + * it from this path! It will be freed eventually by the + * other program on release. + * + * At this point, we don't need a deferred release since + * clone is guaranteed to not be locked. + */ + fp->aux = NULL; + __bpf_prog_free(fp); +} + +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ + /* We have to repoint aux->prog to self, as we don't + * know whether fp here is the clone or the original. + */ + fp->aux->prog = fp; + bpf_prog_clone_free(fp_other); +} + +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +{ + struct bpf_insn insn_buff[16], aux[2]; + struct bpf_prog *clone, *tmp; + int insn_delta, insn_cnt; + struct bpf_insn *insn; + int i, rewritten; + + if (!bpf_jit_blinding_enabled()) + return prog; + + clone = bpf_prog_clone_create(prog, GFP_USER); + if (!clone) + return ERR_PTR(-ENOMEM); + + insn_cnt = clone->len; + insn = clone->insnsi; + + for (i = 0; i < insn_cnt; i++, insn++) { + /* We temporarily need to hold the original ld64 insn + * so that we can still access the first part in the + * second blinding run. 
+ */ + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && + insn[1].code == 0) + memcpy(aux, insn, sizeof(aux)); + + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + if (!rewritten) + continue; + + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + if (!tmp) { + /* Patching may have repointed aux->prog during + * realloc from the original one, so we need to + * fix it up here on error. + */ + bpf_jit_prog_release_other(prog, clone); + return ERR_PTR(-ENOMEM); + } + + clone = tmp; + insn_delta = rewritten - 1; + + /* Walk new program and skip insns we just inserted. */ + insn = clone->insnsi + i + insn_delta; + insn_cnt += insn_delta; + i += insn_delta; + } + + return clone; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, @@ -649,6 +922,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) @@ -690,15 +964,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program + * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. */ -int bpf_prog_select_runtime(struct bpf_prog *fp) +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { fp->bpf_func = (void *) __bpf_prog_run; - bpf_int_jit_compile(fp); + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ + fp = bpf_int_jit_compile(fp); bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -706,7 +987,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp) * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ - return bpf_check_tail_call(fp); + *err = bpf_check_tail_call(fp); + + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -762,14 +1045,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; + const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; + const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } +const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +{ + return NULL; +} + /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, @@ -781,8 +1071,14 @@ const struct bpf_func_proto bpf_tail_call_proto = { }; /* For classic BPF JITs that don't implement bpf_int_jit_compile(). 
*/ -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ + return prog; +} + +bool __weak bpf_helper_changes_skb_data(void *func) { + return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a315..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/vmalloc.h> +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,26 +33,108 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; struct rcu_head rcu; u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; + + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; 
htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -77,24 +163,39 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -111,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -148,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -226,86 +340,248 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +{ + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + kfree(l); + +} + +static void htab_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is 
possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); + } else { + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ + u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); + struct htab_elem *l_new; + void __percpu *pptr; + + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= 
map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); goto err; } - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } + ret = 0; +err: raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} - return 0; +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + } + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -331,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - kfree_rcu(l, rcu); + free_htab_elem(htab, l); ret = 0; } @@ -351,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - kfree(l); + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -369,10 +642,16 @@ static void 
htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } @@ -391,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..ad7a0573f71b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) struct task_struct *task = current; char *buf = (char *) (long) r1; - if (!task) - return -EINVAL; + if (unlikely(!task)) + goto err_clear; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strncpy(buf, task->comm, size); + + /* Verifier guarantees that size > 0. For task->comm exceeding + * size, guarantee that buf is %NUL-terminated. Unconditionally + * done here to save the size test. 
+ */ + buf[size - 1] = 0; return 0; +err_clear: + memset(buf, 0, size); + return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..71b75d9c81da 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } -static bool bpf_dname_reserved(const struct dentry *dentry) -{ - return strchr(dentry->d_name.name, '.'); -} - static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } -static int bpf_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static struct dentry * +bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_link(old_dentry, dir, new_dentry); -} - -static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (strchr(dentry->d_name.name, '.')) + return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } static const struct inode_operations bpf_dir_iops = { - .lookup = simple_lookup, + .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename = bpf_rename, - .link = bpf_link, + .rename = simple_rename, + .link = simple_link, .unlink = simple_unlink, }; @@ -297,7 +276,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include <linux/spinlock.h> +#include <linux/percpu.h> + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..c8ee35287bfe --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,290 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> +#include "percpu_freelist.h" + +struct stack_map_bucket { + struct pcpu_freelist_node fnode; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; + u32 n_buckets; + struct stack_map_bucket *buckets[]; +}; + +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->map_flags) + return ERR_PTR(-EINVAL); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + err = prealloc_elems_and_freelist(smap); + if 
(err) + goto put_buffers; + + return &smap->map; + +put_buffers: + put_callchain_buffers(); +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = READ_ONCE(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; + + if (unlikely(id >= smap->n_buckets)) + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ 
+ return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + + /* wait for bpf programs to complete before freeing stack map */ + synchronize_rcu(); + + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f76..46ecce4b79ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -122,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" - "max_entries:\t%u\n", + "max_entries:\t%u\n" + "map_flags:\t%#x\n", map->map_type, map->key_size, map->value_size, - map->max_entries); + map->max_entries, + map->map_flags); } #endif @@ -151,7 +168,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -201,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 
ufd) @@ -217,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -229,6 +253,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -239,6 +268,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +289,37 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +342,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +363,37 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = 
map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -371,9 +432,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); @@ -600,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -612,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; @@ -688,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - err = bpf_prog_select_runtime(prog); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e7f7ab739e4..a08d66215245 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -136,13 +137,32 @@ enum bpf_reg_type { FRAME_PTR, /* reg == frame_pointer */ PTR_TO_STACK, /* reg == frame_pointer + imm */ CONST_IMM, /* constant integer value */ + + /* PTR_TO_PACKET represents: + * skb->data + * skb->data + imm + * skb->data + (u16) var + * skb->data + (u16) var + imm + * if (range > 0) then [ptr, ptr + range - off) is safe to access + * if (id > 0) means that some 'var' was added + * if (off > 0) menas that 'imm' was added + */ + PTR_TO_PACKET, + PTR_TO_PACKET_END, /* skb->data + headlen */ }; struct reg_state { enum bpf_reg_type type; union { - /* valid when type == CONST_IMM | PTR_TO_STACK */ - int imm; + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL @@ -202,6 +222,16 @@ struct verifier_env { bool allow_ptr_leaks; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_STACK 1024 + +struct bpf_call_arg_meta { + struct bpf_map *map_ptr; + bool raw_mode; + int regno; + int access_size; +}; + /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ @@ -237,39 +267,39 @@ static const char * const reg_type_str[] = { [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_END] = "pkt_end", }; -static const struct { - int map_type; - int func_id; -} func_limit[] = { - {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, -}; - -static void print_verifier_state(struct verifier_env *env) +static void 
print_verifier_state(struct verifier_state *state) { + struct reg_state *reg; enum bpf_reg_type t; int i; for (i = 0; i < MAX_BPF_REG; i++) { - t = env->cur_state.regs[i].type; + reg = &state->regs[i]; + t = reg->type; if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%d", env->cur_state.regs[i].imm); + verbose("%lld", reg->imm); + else if (t == PTR_TO_PACKET) + verbose("(id=%d,off=%d,r=%d)", + reg->id, reg->off, reg->range); + else if (t == UNKNOWN_VALUE && reg->imm) + verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", - env->cur_state.regs[i].map_ptr->key_size, - env->cur_state.regs[i].map_ptr->value_size); + reg->map_ptr->key_size, + reg->map_ptr->value_size); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + if (state->stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -453,7 +483,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, elem->next = env->head; env->head = elem; env->stack_size++; - if (env->stack_size > 1024) { + if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; } @@ -476,7 +506,6 @@ static void init_reg_state(struct reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; - regs[i].map_ptr = NULL; } /* frame pointer */ @@ -491,7 +520,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].imm = 0; - regs[regno].map_ptr = NULL; } enum reg_arg_type { @@ -547,6 +575,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: + case PTR_TO_PACKET: + case PTR_TO_PACKET_END: case FRAME_PTR: case CONST_PTR_TO_MAP: return true; @@ -646,13 +676,38 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, return 0; } +#define MAX_PACKET_OFF 0xffff + +static int check_packet_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *reg = ®s[regno]; + int linear_size = (int) reg->range - (int) reg->off; + + if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) { + verbose("verifier bug\n"); + return -EFAULT; + } + if (off < 0 || off + size > linear_size) { + verbose("invalid access to packet, off=%d size=%d, allowed=%d\n", + off, size, linear_size); + return -EACCES; + } + return 0; +} + /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, enum bpf_access_type t) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; @@ -672,6 +727,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, + int off, 
int size) +{ + if (reg->type != PTR_TO_PACKET) { + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } else { + return 0; + } + } + + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + break; + default: + verbose("verifier is misconfigured\n"); + return -EACCES; + } + + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + /* misaligned access to packet is ok on x86,arm,arm64 */ + return 0; + + if (reg->id && size != 1) { + verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + return -EACCES; + } + + /* skb->data is NET_IP_ALIGN-ed */ + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + verbose("misaligned packet access off %d+%d+%d size %d\n", + NET_IP_ALIGN, reg->off, off, size); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -683,21 +777,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, int value_regno) { struct verifier_state *state = &env->cur_state; + struct reg_state *reg = &state->regs[regno]; int size, err = 0; - if (state->regs[regno].type == PTR_TO_STACK) - off += state->regs[regno].imm; + if (reg->type == PTR_TO_STACK) + off += reg->imm; size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); - return -EACCES; - } + err = check_ptr_alignment(env, reg, off, size); + if (err) + return err; - if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); @@ -707,18 +801,25 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == PTR_TO_CTX) { + } else if (reg->type == PTR_TO_CTX) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } err = check_ctx_access(env, off, size, t); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); + if (off == offsetof(struct __sk_buff, data) && + env->allow_ptr_leaks) + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = PTR_TO_PACKET; + else if (off == offsetof(struct __sk_buff, data_end) && + env->allow_ptr_leaks) + state->regs[value_regno].type = PTR_TO_PACKET_END; + } - } else if (state->regs[regno].type == FRAME_PTR || - state->regs[regno].type == PTR_TO_STACK) { + } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -734,11 +835,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } else { err = check_stack_read(state, off, size, value_regno); } + } else if (state->regs[regno].type == PTR_TO_PACKET) { + if (t == BPF_WRITE) { + verbose("cannot write into packet\n"); + return -EACCES; + } + err = check_packet_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + 
mark_reg_unknown_value(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[state->regs[regno].type]); + regno, reg_type_str[reg->type]); return -EACCES; } + + if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && + state->regs[value_regno].type == UNKNOWN_VALUE) { + /* 1 or 2 byte load zero-extends, determine the number of + * zero upper bits. Not doing it fo 4 byte load, since + * such values cannot be added to ptr_to_packet anyway. + */ + state->regs[value_regno].imm = 64 - size * 8; + } return err; } @@ -778,15 +896,25 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -796,6 +924,12 @@ static int check_stack_boundary(struct verifier_env *env, return -EACCES; } + if (meta && meta->raw_mode) { + meta->access_size = access_size; + meta->regno = regno; + return 0; + } + for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", @@ -807,7 +941,8 @@ static int check_stack_boundary(struct verifier_env *env, } static int check_func_arg(struct verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, struct bpf_map **mapp) + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) { struct reg_state *reg = env->cur_state.regs + regno; enum bpf_reg_type expected_type; @@ -829,15 +964,26 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK || + arg_type == ARG_PTR_TO_RAW_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. 
+ */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; + meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -851,14 +997,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - *mapp = reg->map_ptr; - + meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!*mapp) { + if (!meta->map_ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -867,20 +1012,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!*mapp) { + if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -890,7 +1039,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed, meta); } return err; @@ -898,24 +1048,93 @@ static int check_func_arg(struct verifier_env *env, u32 regno, static int check_map_func_compatibility(struct bpf_map *map, int func_id) { - bool bool_map, bool_func; - int i; - if (!map) return 0; - for (i = 0; i < ARRAY_SIZE(func_limit); i++) { - bool_map = (map->map_type == func_limit[i].map_type); - bool_func = (func_id == func_limit[i].func_id); - /* only when map & func pair match it can continue. - * don't allow any other map type to be passed into - * the special func; - */ - if (bool_func && bool_map != bool_func) - return -EINVAL; + /* We need a two way check, first is from map perspective ... */ + switch (map->map_type) { + case BPF_MAP_TYPE_PROG_ARRAY: + if (func_id != BPF_FUNC_tail_call) + goto error; + break; + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + if (func_id != BPF_FUNC_perf_event_read && + func_id != BPF_FUNC_perf_event_output) + goto error; + break; + case BPF_MAP_TYPE_STACK_TRACE: + if (func_id != BPF_FUNC_get_stackid) + goto error; + break; + default: + break; + } + + /* ... and second from the function itself. 
*/ + switch (func_id) { + case BPF_FUNC_tail_call: + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + goto error; + break; + case BPF_FUNC_perf_event_read: + case BPF_FUNC_perf_event_output: + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + goto error; + break; + case BPF_FUNC_get_stackid: + if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) + goto error; + break; + default: + break; } return 0; +error: + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); + return -EINVAL; +} + +static int check_raw_mode(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + count++; + + return count > 1 ? -EINVAL : 0; +} + +static void clear_all_pkt_pointers(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET || + regs[i].type == PTR_TO_PACKET_END) + mark_reg_unknown_value(regs, i); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type != PTR_TO_PACKET && + reg->type != PTR_TO_PACKET_END) + continue; + reg->type = UNKNOWN_VALUE; + reg->imm = 0; + } } static int check_call(struct verifier_env *env, int func_id) @@ -923,8 +1142,9 @@ static int check_call(struct verifier_env *env, int func_id) struct verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct reg_state *regs = state->regs; - struct bpf_map *map = NULL; struct reg_state *reg; + struct bpf_call_arg_meta meta; + bool changes_data; int i, err; /* find function prototype */ @@ -947,23 +1167,45 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } + changes_data = bpf_helper_changes_skb_data(fn->func); + + memset(&meta, 0, sizeof(meta)); + + /* We only support one arg being in raw mode at the moment, which + * is sufficient for the helper functions we have right now. + */ + err = check_raw_mode(fn); + if (err) { + verbose("kernel subsystem misconfigured func %d\n", func_id); + return err; + } + /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); if (err) return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset + * is inferred from register state. 
+ */ + for (i = 0; i < meta.access_size; i++) { + err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + if (err) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { reg = regs + caller_saved[i]; @@ -982,28 +1224,211 @@ static int check_call(struct verifier_env *env, int func_id) * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (map == NULL) { + if (meta.map_ptr == NULL) { verbose("kernel subsystem misconfigured verifier\n"); return -EINVAL; } - regs[BPF_REG_0].map_ptr = map; + regs[BPF_REG_0].map_ptr = meta.map_ptr; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); return -EINVAL; } - err = check_map_func_compatibility(map, func_id); + err = check_map_func_compatibility(meta.map_ptr, func_id); if (err) return err; + if (changes_data) + clear_all_pkt_pointers(env); + return 0; +} + +static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + s32 imm; + + if (BPF_SRC(insn->code) == BPF_K) { + /* pkt_ptr += imm */ + imm = insn->imm; + +add_imm: + if (imm <= 0) { + verbose("addition of negative constant to packet pointer is not allowed\n"); + return -EACCES; + } + if (imm >= MAX_PACKET_OFF || + imm + dst_reg->off >= MAX_PACKET_OFF) { + verbose("constant %d is too large to add to packet pointer\n", + imm); + return -EACCES; + } + /* a constant was added to pkt_ptr. + * Remember it while keeping the same 'id' + */ + dst_reg->off += imm; + } else { + if (src_reg->type == CONST_IMM) { + /* pkt_ptr += reg where reg is known constant */ + imm = src_reg->imm; + goto add_imm; + } + /* disallow pkt_ptr += reg + * if reg is not uknown_value with guaranteed zero upper bits + * otherwise pkt_ptr may overflow and addition will become + * subtraction which is not allowed + */ + if (src_reg->type != UNKNOWN_VALUE) { + verbose("cannot add '%s' to ptr_to_packet\n", + reg_type_str[src_reg->type]); + return -EACCES; + } + if (src_reg->imm < 48) { + verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", + src_reg->imm); + return -EACCES; + } + /* dst_reg stays as pkt_ptr type and since some positive + * integer value was added to the pointer, increment its 'id' + */ + dst_reg->id++; + + /* something was added to pkt_ptr, set range and off to zero */ + dst_reg->off = 0; + dst_reg->range = 0; + } + return 0; +} + +static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2; + + /* for type == UNKNOWN_VALUE: + * imm > 0 -> number of zero upper bits + * imm == 0 -> don't track which is the same as all bits can be non-zero + */ + + if (BPF_SRC(insn->code) == BPF_X) { + struct reg_state *src_reg = ®s[insn->src_reg]; + + if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(dst_reg->imm, src_reg->imm);
+			dst_reg->imm--;
+			return 0;
+		}
+		if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
+		    dst_reg->imm && opcode == BPF_ADD) {
+			/* dreg += sreg
+			 * where dreg has zero upper bits and sreg is const.
+			 * Adding them can only result in making one more bit
+			 * non-zero in the larger value.
+			 */
+			imm_log2 = __ilog2_u64((long long)src_reg->imm);
+			dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			return 0;
+		}
+		/* all other cases not supported yet, just mark dst_reg */
+		dst_reg->imm = 0;
+		return 0;
+	}
+
+	/* sign extend 32-bit imm into 64-bit to make sure that
+	 * negative values occupy bit 63. Note ilog2() would have
+	 * been incorrect, since sizeof(insn->imm) == 4
+	 */
+	imm_log2 = __ilog2_u64((long long)insn->imm);
+
+	if (dst_reg->imm && opcode == BPF_LSH) {
+		/* reg <<= imm
+		 * if reg was a result of 2 byte load, then its imm == 48
+		 * which means that upper 48 bits are zero and shifting this reg
+		 * left by 4 would mean that upper 44 bits are still zero
+		 */
+		dst_reg->imm -= insn->imm;
+	} else if (dst_reg->imm && opcode == BPF_MUL) {
+		/* reg *= imm
+		 * if multiplying by 14 subtract 4
+		 * This is a conservative calculation of upper zero bits.
+		 * It's not trying to special case insn->imm == 1 or 0 cases
+		 */
+		dst_reg->imm -= imm_log2 + 1;
+	} else if (opcode == BPF_AND) {
+		/* reg &= imm */
+		dst_reg->imm = 63 - imm_log2;
+	} else if (dst_reg->imm && opcode == BPF_ADD) {
+		/* reg += imm */
+		dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+		dst_reg->imm--;
+	} else if (opcode == BPF_RSH) {
+		/* reg >>= imm
+		 * which means that after right shift, upper bits will be zero
+		 * note that verifier already checked that
+		 * 0 <= imm < 64 for shift insn
+		 */
+		dst_reg->imm += insn->imm;
+		if (unlikely(dst_reg->imm > 64))
+			/* some dumb code did:
+			 * r2 = *(u32 *)mem;
+			 * r2 >>= 32;
+			 * and all bits are zero now */
+			dst_reg->imm = 64;
+	} else {
+		/* all other alu ops, means that we don't know what will
+		 * happen to the value, mark it with unknown number of zero bits
+		 */
+		dst_reg->imm = 0;
+	}
+
+	if (dst_reg->imm < 0) {
+		/* all 64 bits of the register can contain non-zero bits
+		 * and such value cannot be added to ptr_to_packet, since it
+		 * may overflow, mark it as unknown to avoid further eval
+		 */
+		dst_reg->imm = 0;
+	}
+	return 0;
+}
+
+static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	struct reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+
+	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
+ * Don't care about overflow or negative values, just add them + */ + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm += insn->imm; + else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm += src_reg->imm; + else + mark_reg_unknown_value(regs, insn->dst_reg); return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1092,8 +1517,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ - bool stack_relative = false; - if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { verbose("BPF_ALU uses reserved fields\n"); @@ -1131,11 +1554,34 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + dst_reg = ®s[insn->dst_reg]; + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) { - stack_relative = true; + dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { + dst_reg->type = PTR_TO_STACK; + dst_reg->imm = insn->imm; + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == PTR_TO_PACKET) { + /* ptr_to_packet += K|X */ + return check_packet_ptr_add(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == UNKNOWN_VALUE && + env->allow_ptr_leaks) { + /* unknown += K|X */ + return evaluate_reg_alu(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == CONST_IMM && + env->allow_ptr_leaks) { + /* reg_imm += K|X */ + return evaluate_reg_imm_alu(env, insn); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); @@ -1147,24 +1593,45 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); - if (err) - return err; - - if (stack_relative) { - regs[insn->dst_reg].type = PTR_TO_STACK; - regs[insn->dst_reg].imm = insn->imm; - } + /* mark dest operand */ + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; } +static void find_good_pkt_pointers(struct verifier_env *env, + struct reg_state *dst_reg) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + /* r2 = r3; + * r2 += 8 + * if (r2 > pkt_end) goto somewhere + * r2 == dst_reg, pkt_end == src_reg, + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access + */ + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + regs[i].range = dst_reg->off; + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg->range = dst_reg->off; + } +} + static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn 
*insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; struct verifier_state *other_branch; u8 opcode = BPF_OP(insn->code); int err; @@ -1202,11 +1669,12 @@ static int check_cond_jmp_op(struct verifier_env *env, if (err) return err; + dst_reg = ®s[insn->dst_reg]; + /* detect if R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - regs[insn->dst_reg].type == CONST_IMM && - regs[insn->dst_reg].imm == insn->imm) { + dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -1228,44 +1696,30 @@ static int check_cond_jmp_op(struct verifier_env *env, /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || - opcode == BPF_JNE) && - regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (opcode == BPF_JEQ) { /* next fallthrough insn can access memory via * this register */ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; /* branch targer cannot access it, since reg == 0 */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(other_branch->regs, + insn->dst_reg); } else { other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(regs, insn->dst_reg); } + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + find_good_pkt_pointers(env, dst_reg); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE)) { - - if (opcode == BPF_JEQ) { - /* detect if (R == imm) goto - * and in the target state recognize that R = imm - */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = insn->imm; - } else { - /* detect if (R != imm) goto - * and in the fall-through state recognize that R = imm - */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - } } if (log_level) - print_verifier_state(env); + print_verifier_state(&env->cur_state); return 0; } @@ -1343,13 +1797,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_ABS uses reserved fields\n"); + verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -1513,6 +1968,8 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -1580,6 +2037,58 @@ err_free: return ret; } +/* the following conditions reduce the number of explored 
insns + * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet + */ +static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +{ + if (old->id != cur->id) + return false; + + /* old ptr_to_packet is more conservative, since it allows smaller + * range. Ex: + * old(off=0,r=10) is equal to cur(off=0,r=20), because + * old(off=0,r=10) means that with range=10 the verifier proceeded + * further and found no issues with the program. Now we're in the same + * spot with cur(off=0,r=20), so we're safe too, since anything further + * will only be looking at most 10 bytes after this pointer. + */ + if (old->off == cur->off && old->range < cur->range) + return true; + + /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) + * since both cannot be used for packet access and safe(old) + * pointer has smaller off that could be used for further + * 'if (ptr > data_end)' check + * Ex: + * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean + * that we cannot access the packet. + * The safe range is: + * [ptr, ptr + range - off) + * so whenever off >=range, it means no safe bytes from this pointer. + * When comparing old->off <= cur->off, it means that older code + * went with smaller offset and that offset was later + * used to figure out the safe range after 'if (ptr > data_end)' check + * Say, 'old' state was explored like: + * ... R3(off=0, r=0) + * R4 = R3 + 20 + * ... now R4(off=20,r=0) <-- here + * if (R4 > data_end) + * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. + * ... the code further went all the way to bpf_exit. + * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). + * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier + * goes further, such cur_R4 will give larger safe packet range after + * 'if (R4 > data_end)' and all further insn were already good with r=20, + * so they will be good with r=30 and we can prune the search. + */ + if (old->off <= cur->off && + old->off >= old->range && cur->off >= cur->range) + return true; + + return false; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -1608,17 +2117,25 @@ err_free: */ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) { + struct reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { - if (memcmp(&old->regs[i], &cur->regs[i], - sizeof(old->regs[0])) != 0) { - if (old->regs[i].type == NOT_INIT || - (old->regs[i].type == UNKNOWN_VALUE && - cur->regs[i].type != NOT_INIT)) - continue; - return false; - } + rold = &old->regs[i]; + rcur = &cur->regs[i]; + + if (memcmp(rold, rcur, sizeof(*rold)) == 0) + continue; + + if (rold->type == NOT_INIT || + (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + continue; + + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && + compare_ptrs_to_packet(rold, rcur)) + continue; + + return false; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -1717,7 +2234,7 @@ static int do_check(struct verifier_env *env) insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 32768) { + if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose("BPF program is too large. 
Proccessed %d insn\n", insn_processed); return -E2BIG; @@ -1740,7 +2257,7 @@ static int do_check(struct verifier_env *env) if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env); + print_verifier_state(&env->cur_state); do_print_state = false; } @@ -1952,6 +2469,7 @@ process_bpf_exit: insn_idx++; } + verbose("processed %d insns\n", insn_processed); return 0; } @@ -2003,7 +2521,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); - fdput(f); return PTR_ERR(map); } @@ -2023,15 +2540,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; @@ -2067,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) -{ - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; - - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; - } -} - /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -2096,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env) int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 insn_delta, cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2125,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; - - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); - - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); - - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); + insn_delta = cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + insn = new_prog->insnsi + i + insn_delta; + + insn_cnt += insn_delta; + i += insn_delta; } return 0; |
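
Editor's note: the map_lookup_elem()/map_update_elem() hunks above size the user buffer as round_up(value_size, 8) * num_possible_cpus() for the per-CPU map types. Below is a minimal user-space sketch of the lookup side using the raw bpf(2) syscall; the helper name read_percpu_counter and its arguments are illustrative, and the caller is assumed to pass the number of possible CPUs (e.g. parsed from /sys/devices/system/cpu/possible), which can be larger than the number of online CPUs.

#include <linux/bpf.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Raw bpf(2) wrapper; glibc provides no bpf() wrapper at this point. */
static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* Read back one element of a BPF_MAP_TYPE_PERCPU_ARRAY of u64 counters.
 * The kernel copies round_up(value_size, 8) bytes per possible CPU, so
 * the user buffer must be sized accordingly (u64 is already 8-aligned).
 */
static int read_percpu_counter(int map_fd, uint32_t key, int possible_cpus)
{
	union bpf_attr attr;
	uint64_t *values;
	int i, err;

	values = calloc(possible_cpus, sizeof(*values));
	if (!values)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uint64_t)(unsigned long)&key;
	attr.value = (uint64_t)(unsigned long)values;

	err = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
	if (!err)
		for (i = 0; i < possible_cpus; i++)
			printf("cpu%d: %llu\n", i,
			       (unsigned long long)values[i]);
	free(values);
	return err;
}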
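Editor's note: check_raw_mode() allows at most one ARG_PTR_TO_RAW_STACK argument per helper. Such an argument is treated as written by the helper, so the program does not have to initialize the stack buffer beforehand, and ARG_CONST_STACK_SIZE_OR_ZERO additionally accepts a (NULL, 0) pair, which is only resolved during check_stack_boundary(). A hypothetical kernel-side sketch of what a helper prototype using these types would look like; bpf_fill_buf is invented for illustration and is not part of this patch.

/* Hypothetical in-kernel helper: it only writes to its buffer, never
 * reads it, so callers may pass uninitialized stack memory, or NULL
 * together with size 0.
 */
static u64 bpf_fill_buf(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	void *buf = (void *)(unsigned long)r1;
	u32 size = (u32)r2;

	if (size)
		memset(buf, 0, size);
	return 0;
}

static const struct bpf_func_proto bpf_fill_buf_proto = {
	.func		= bpf_fill_buf,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_RAW_STACK,		/* helper fills it, no init needed */
	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,	/* NULL buffer with size 0 allowed */
};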
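Editor's note: on the program side, PTR_TO_PACKET / PTR_TO_PACKET_END and find_good_pkt_pointers() are what make the usual "bounds-check against data_end, then dereference" idiom pass the verifier for SCHED_CLS and SCHED_ACT programs (loaded with CAP_SYS_ADMIN, since the data/data_end conversion is gated on allow_ptr_leaks). A minimal restricted-C sketch follows, assuming clang's BPF target and this kernel's uapi headers; the section name and drop policy are purely illustrative.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/in.h>

/* Sketch of a SCHED_CLS program: drop IPv4 TCP, pass everything else.
 * The loads of skb->data / skb->data_end become PTR_TO_PACKET /
 * PTR_TO_PACKET_END, and the "> data_end" check is what gives the
 * packet pointers a non-zero safe range before the dereference.
 * (An ethertype check is omitted to keep the sketch short.)
 */
__attribute__((section("classifier"), used))
int cls_drop_tcp(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct iphdr *iph;

	/* too short: packet range check fails, let the packet pass */
	if (data + sizeof(struct ethhdr) + sizeof(*iph) > data_end)
		return TC_ACT_OK;

	iph = data + sizeof(struct ethhdr);
	return iph->protocol == IPPROTO_TCP ? TC_ACT_SHOT : TC_ACT_OK;
}

/* loaders expect a license section for bpf_prog_load() */
char _license[] __attribute__((section("license"), used)) = "GPL";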
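Editor's note: the "known zero upper bits" tracking in check_mem_access() and evaluate_reg_alu() is what allows an offset loaded from the packet itself to be added to a packet pointer. A sketch of that pattern is below; whether a given compiler emits it in a form the verifier accepts also depends on register allocation, so it is illustrative rather than guaranteed.

/* Reads a 2-byte offset from the start of the packet, then one byte at
 * that offset. The 2-byte load leaves 48 upper bits known zero, so
 * "data + off" is an allowed pkt_ptr += reg; the second "> data_end"
 * check gives the derived pointer its safe range. Byte-sized access is
 * used so the example also passes check_ptr_alignment() on arches
 * without efficient unaligned access.
 */
__attribute__((section("classifier"), used))
int cls_var_offset(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	unsigned char *p;
	__u16 off;

	if (data + sizeof(off) > data_end)
		return TC_ACT_OK;
	off = *(__u16 *)data;

	p = data + off;
	if ((void *)(p + 1) > data_end)
		return TC_ACT_OK;

	return p[0] ? TC_ACT_OK : TC_ACT_SHOT;
}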