From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 18 Aug 2017 11:28:00 -0700
Subject: bpf: Allow selecting numa node during map creation

The current map creation API does not allow providing a numa-node
preference.  The memory usually comes from the node where the
map-creating process is running.  The performance is not ideal if the
bpf_prog is known to always run on a numa node different from the
map-creating process.

One use case is sharding across CPUs into different LRU maps (i.e. an
array of LRU maps).  Here is the map_perf_test result for the
INNER_LRU_HASH_PREALLOC test if we force the LRU map used by CPU0 to be
allocated from a remote numa node:

[ The machine has 20 cores.  CPU0-9 at node 0.  CPU10-19 at node 1 ]

># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec
4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec
3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec
6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec
2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec
1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec
7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec
0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec  #<<<

After specifying the numa node:
># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec
3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec
1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec
6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec
2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec
4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec
7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec
0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec  #<<<

This patch adds one field, numa_node, to the bpf_attr.  Since numa node
0 is a valid node, a new flag, BPF_F_NUMA_NODE, is also added.  The
numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is
set.  Numa node selection is not supported for percpu maps.

This patch does not change all the kmalloc() calls.  For example,
'htab = kzalloc()' is not changed, since the object is small enough to
stay in the cache.

Signed-off-by: Martin KaFai Lau
Acked-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
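For illustration only (not part of the patch), a minimal userspace
sketch of the new interface: creating an LRU hash whose memory comes
from a given node via the raw bpf(2) syscall.  The field names follow
the uapi described above; the map sizes are arbitrary.

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Create a BPF_MAP_TYPE_LRU_HASH allocated on NUMA node 'nid'.
     * numa_node is honored only because BPF_F_NUMA_NODE is set in
     * map_flags; without the flag the field is ignored.
     */
    static int create_map_on_node(int nid)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_type    = BPF_MAP_TYPE_LRU_HASH;
        attr.key_size    = 4;        /* arbitrary example sizes */
        attr.value_size  = 8;
        attr.max_entries = 1024;
        attr.map_flags   = BPF_F_NUMA_NODE;
        attr.numa_node   = nid;

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }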
---
 kernel/bpf/arraymap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf/arraymap.c')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a3872500..96e9c5c1dfc9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
     bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+    int numa_node = bpf_map_attr_numa_node(attr);
     struct bpf_array *array;
     u64 array_size;
     u32 elem_size;

     /* check sanity of attributes */
     if (attr->max_entries == 0 || attr->key_size != 4 ||
-        attr->value_size == 0 || attr->map_flags)
+        attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+        (percpu && numa_node != NUMA_NO_NODE))
         return ERR_PTR(-EINVAL);

     if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
         return ERR_PTR(-ENOMEM);

     /* allocate all map elements and zero-initialize them */
-    array = bpf_map_area_alloc(array_size);
+    array = bpf_map_area_alloc(array_size, numa_node);
     if (!array)
         return ERR_PTR(-ENOMEM);

@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
     array->map.value_size = attr->value_size;
     array->map.max_entries = attr->max_entries;
     array->map.map_flags = attr->map_flags;
+    array->map.numa_node = numa_node;
     array->elem_size = elem_size;

     if (!percpu)
--
cgit v1.2.3

From 7b0c2a0508b90fce79d3782b2e55d0e8bf6a283e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 19 Aug 2017 03:12:46 +0200
Subject: bpf: inline map in map lookup functions for array and htab

Avoid two successive function calls for the map-in-map lookup: the
first is the bpf_map_lookup_elem() helper call, and the second is the
callback via map->ops->map_lookup_elem() into the map-in-map
implementation.  This patch inlines the array and htab flavors of
map-in-map lookups.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
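As an illustration (not part of the patch), the instruction sequence
emitted by array_of_map_gen_lookup() in the diff below is roughly
equivalent to the following C; the verifier invokes map_gen_lookup to
replace the helper call, with the outer map pointer in R1 and the key
pointer in R2.  The function name is hypothetical.

    #include <linux/bpf.h>
    #include <linux/kernel.h>

    /* Hypothetical C rendering of the inlined array-of-maps lookup;
     * struct bpf_array and round_up() are the kernel definitions used
     * by the generator.
     */
    static void *inlined_array_of_map_lookup(struct bpf_array *array, u32 *key)
    {
        u64 index = *key;                    /* BPF_LDX_MEM(BPF_W, ...)   */

        if (index >= array->map.max_entries) /* BPF_JMP_IMM(BPF_JGE, ...) */
            return NULL;                     /* BPF_MOV64_IMM(ret, 0)     */

        /* Scale by round_up(value_size, 8), add the base of ->value,
         * then load the inner map pointer stored in the element
         * (BPF_LDX_MEM(BPF_DW, ...)); a NULL slot yields NULL.
         */
        return *(void **)(array->value +
                          index * round_up(array->map.value_size, 8));
    }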
---
 kernel/bpf/arraymap.c | 26 ++++++++++++++++++++++++++
 kernel/bpf/hashtab.c  | 17 +++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'kernel/bpf/arraymap.c')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 96e9c5c1dfc9..98c0f00c3f5e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -606,6 +606,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
     return READ_ONCE(*inner_map);
 }

+static u32 array_of_map_gen_lookup(struct bpf_map *map,
+                                   struct bpf_insn *insn_buf)
+{
+    u32 elem_size = round_up(map->value_size, 8);
+    struct bpf_insn *insn = insn_buf;
+    const int ret = BPF_REG_0;
+    const int map_ptr = BPF_REG_1;
+    const int index = BPF_REG_2;
+
+    *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+    *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+    *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+    if (is_power_of_2(elem_size))
+        *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+    else
+        *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+    *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+    *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+    *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+    *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+    *insn++ = BPF_MOV64_IMM(ret, 0);
+
+    return insn - insn_buf;
+}
+
 const struct bpf_map_ops array_of_maps_map_ops = {
     .map_alloc = array_of_map_alloc,
     .map_free = array_of_map_free,
@@ -615,4 +640,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
     .map_fd_get_ptr = bpf_map_fd_get_ptr,
     .map_fd_put_ptr = bpf_map_fd_put_ptr,
     .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+    .map_gen_lookup = array_of_map_gen_lookup,
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 47ae748c3a49..ae822de4a90a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1322,6 +1322,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
     return READ_ONCE(*inner_map);
 }

+static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+                                  struct bpf_insn *insn_buf)
+{
+    struct bpf_insn *insn = insn_buf;
+    const int ret = BPF_REG_0;
+
+    *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+    *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+    *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+                            offsetof(struct htab_elem, key) +
+                            round_up(map->key_size, 8));
+    *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+
+    return insn - insn_buf;
+}
+
 static void htab_of_map_free(struct bpf_map *map)
 {
     bpf_map_meta_free(map->inner_map_meta);
@@ -1337,4 +1353,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
     .map_fd_get_ptr = bpf_map_fd_get_ptr,
     .map_fd_put_ptr = bpf_map_fd_put_ptr,
     .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+    .map_gen_lookup = htab_of_map_gen_lookup,
 };
--
cgit v1.2.3

From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 5 Oct 2017 09:19:19 -0700
Subject: bpf: perf event change needed for subsequent bpf helpers

This patch does not impact existing functionality.  It contains the
changes in the perf event area needed for the subsequent
bpf_perf_event_read_value and bpf_perf_prog_read_value helpers.

Signed-off-by: Yonghong Song
Acked-by: Peter Zijlstra (Intel)
Signed-off-by: David S. Miller
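A sketch of how a caller might use the extended
perf_event_read_local() signature from the diff below.  This function
is illustrative, not from the patch; the count * enabled / running
scaling follows perf's usual convention for multiplexed events, and
callers that only need the raw count pass NULL for enabled/running, as
the updated call sites do.

    #include <linux/math64.h>
    #include <linux/perf_event.h>

    /* Illustrative only: read a counter plus its enabled/running
     * times and scale the raw count (value * enabled may overflow for
     * very large counts; ignored here for brevity).
     */
    static int perf_read_scaled(struct perf_event *event, u64 *scaled)
    {
        u64 value, enabled, running;
        int err;

        err = perf_event_read_local(event, &value, &enabled, &running);
        if (err)
            return err;

        *scaled = running ? div64_u64(value * enabled, running) : value;
        return 0;
    }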
---
 include/linux/perf_event.h |  7 +++++--
 kernel/bpf/arraymap.c      |  2 +-
 kernel/events/core.c       | 15 +++++++++++++--
 kernel/trace/bpf_trace.c   |  2 +-
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel/bpf/arraymap.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..79b18a20cf5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
     struct pt_regs *regs;
     struct perf_sample_data *data;
+    struct perf_event *event;
 };

 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                  void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
                                      int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                          u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
                                  u64 *enabled, u64 *running);

@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
     return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+                                        u64 *enabled, u64 *running)
 {
     return -EINVAL;
 }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..68d866628be0 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
     ee = ERR_PTR(-EOPNOTSUPP);
     event = perf_file->private_data;
-    if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+    if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
         goto err_out;

     ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..902149f05381 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                          u64 *enabled, u64 *running)
 {
     unsigned long flags;
     int ret = 0;
+    u64 now;

     /*
      * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
         goto out;
     }

+    now = event->shadow_ctx_time + perf_clock();
+    if (enabled)
+        *enabled = now - event->tstamp_enabled;
     /*
      * If the event is currently on this CPU, its either a per-task event,
      * or local to this CPU. Furthermore it means its ACTIVE (otherwise
      * oncpu == -1).
     */
-    if (event->oncpu == smp_processor_id())
+    if (event->oncpu == smp_processor_id()) {
         event->pmu->read(event);
+        if (running)
+            *running = now - event->tstamp_running;
+    } else if (running) {
+        *running = event->total_time_running;
+    }

     *value = local64_read(&event->count);
 out:
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
     struct bpf_perf_event_data_kern ctx = {
         .data = data,
         .regs = regs,
+        .event = event,
     };
     int ret = 0;

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
     if (!ee)
         return -ENOENT;

-    err = perf_event_read_local(ee->event, &value);
+    err = perf_event_read_local(ee->event, &value, NULL, NULL);
     /*
      * this api is ugly since we miss [-22..-2] range of valid
      * counter values, but that's uapi
--
cgit v1.2.3

From bc6d5031b43a2291de638ab9304320b4cae61689 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 17 Oct 2017 16:55:54 +0200
Subject: bpf: do not test for PCPU_MIN_UNIT_SIZE before percpu allocations

PCPU_MIN_UNIT_SIZE is an implementation detail of the percpu
allocator.  Given we support __GFP_NOWARN now, let's just let the
allocation request fail naturally instead.  The two call sites from
BPF mistakenly assumed __GFP_NOWARN would work, so no changes are
needed to their actual __alloc_percpu_gfp() calls, which already use
the flag.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 kernel/bpf/arraymap.c | 2 +-
 kernel/bpf/hashtab.c  | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel/bpf/arraymap.c')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..e2636737b69b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
         array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

     if (array_size >= U32_MAX - PAGE_SIZE ||
-        elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+        bpf_array_alloc_percpu(array)) {
         bpf_map_area_free(array);
         return ERR_PTR(-ENOMEM);
     }
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..6533f08d1238 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
          */
         goto free_htab;

-    if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-        /* make sure the size for pcpu_alloc() is reasonable */
-        goto free_htab;
-
     htab->elem_size = sizeof(struct htab_elem) +
                       round_up(htab->map.key_size, 8);
     if (percpu)
--
cgit v1.2.3

From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce read/write flags to the eBPF syscalls that return a map fd.
The flags are used to set up the file mode when constructing a new
file descriptor for a bpf map.  To preserve backward compatibility,
f_flags is set to O_RDWR if the flag passed via the syscall is 0;
otherwise it should be O_RDONLY or O_WRONLY.  When userspace wants to
modify or read the map content, the kernel checks the file mode to see
whether the operation is allowed.

Signed-off-by: Chenbo Feng
Acked-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
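A minimal userspace sketch (illustrative, not from the patch):
retrieving a pinned map read-only via BPF_OBJ_GET with the new
file_flags field shown in the diff below.  Lookups through the
returned fd still work, while update/delete fail with EPERM; the path
is hypothetical.

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a pinned map for reading only; 'path' would be something
     * like "/sys/fs/bpf/my_map" (hypothetical).
     */
    static int get_map_rdonly(const char *path)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.pathname   = (__u64)(unsigned long)path;
        attr.file_flags = BPF_F_RDONLY;

        return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
    }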
---
 include/linux/bpf.h      |  8 +++--
 include/uapi/linux/bpf.h |  6 ++++
 kernel/bpf/arraymap.c    |  6 +++-
 kernel/bpf/devmap.c      |  5 ++-
 kernel/bpf/hashtab.c     |  5 +--
 kernel/bpf/inode.c       | 15 ++++++---
 kernel/bpf/lpm_trie.c    |  3 +-
 kernel/bpf/sockmap.c     |  5 ++-
 kernel/bpf/stackmap.c    |  5 ++-
 kernel/bpf/syscall.c     | 88 ++++++++++++++++++++++++++++++++++++++++++------
 net/netfilter/xt_bpf.c   |  2 +-
 11 files changed, 122 insertions(+), 26 deletions(-)

(limited to 'kernel/bpf/arraymap.c')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67ccdc0099f..3e5508f2fa87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -315,11 +315,11 @@ void bpf_map_area_free(void *base);

 extern int sysctl_unprivileged_bpf_disabled;

-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);

 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);

 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
                                 void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);

+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only. No barriers here, since it _will_ race with concurrent
@@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }

-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
     return -EOPNOTSUPP;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4303fb6c3817..d83f95ea6a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,10 @@ enum bpf_attach_type {

 #define BPF_OBJ_NAME_LEN 16U

+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY    (1U << 3)
+#define BPF_F_WRONLY    (1U << 4)
+
 union bpf_attr {
     struct { /* anonymous struct used by BPF_MAP_CREATE command */
         __u32 map_type;    /* one of enum bpf_map_type */
@@ -260,6 +264,7 @@ union bpf_attr {
     struct { /* anonymous struct used by BPF_OBJ_* commands */
         __aligned_u64 pathname;
         __u32         bpf_fd;
+        __u32         file_flags;
     };

     struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -287,6 +292,7 @@ union bpf_attr {
             __u32 map_id;
         };
         __u32 next_id;
+        __u32 open_flags;
     };

     struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 68d866628be0..988c04c91e10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@

 #include "map_in_map.h"

+#define ARRAY_CREATE_FLAG_MASK \
+    (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
     int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)

     /* check sanity of attributes */
     if (attr->max_entries == 0 || attr->key_size != 4 ||
-        attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+        attr->value_size == 0 ||
+        attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
         (percpu && numa_node != NUMA_NO_NODE))
         return ERR_PTR(-EINVAL);

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e5d3de7cff2e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
 #include
 #include

+#define DEV_CREATE_FLAG_MASK \
+    (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
     struct net_device *dev;
     struct bpf_dtab *dtab;
@@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)

     /* check sanity of attributes */
     if (attr->max_entries == 0 || attr->key_size != 4 ||
-        attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+        attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
         return ERR_PTR(-EINVAL);

     dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..919955236e63 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"

-#define HTAB_CREATE_FLAG_MASK \
-    (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK \
+    (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+     BPF_F_RDONLY | BPF_F_WRONLY)

 struct bucket {
     struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
 }

 static void *bpf_obj_do_get(const struct filename *pathname,
-                            enum bpf_type *type)
+                            enum bpf_type *type, int flags)
 {
     struct inode *inode;
     struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
         return ERR_PTR(ret);

     inode = d_backing_inode(path.dentry);
-    ret = inode_permission(inode, MAY_WRITE);
+    ret = inode_permission(inode, ACC_MODE(flags));
     if (ret)
         goto out;

@@ -326,18 +326,23 @@ out:
     return ERR_PTR(ret);
 }

-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
     enum bpf_type type = BPF_TYPE_UNSPEC;
     struct filename *pname;
     int ret = -ENOENT;
+    int f_flags;
     void *raw;

+    f_flags = bpf_get_file_flag(flags);
+    if (f_flags < 0)
+        return f_flags;
+
     pname = getname(pathname);
     if (IS_ERR(pname))
         return PTR_ERR(pname);

-    raw = bpf_obj_do_get(pname, &type);
+    raw = bpf_obj_do_get(pname, &type, f_flags);
     if (IS_ERR(raw)) {
         ret = PTR_ERR(raw);
         goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
     if (type == BPF_TYPE_PROG)
         ret = bpf_prog_new_fd(raw);
     else if (type == BPF_TYPE_MAP)
-        ret = bpf_map_new_fd(raw);
+        ret = bpf_map_new_fd(raw, f_flags);
     else
         goto out;

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 34d8a690ea05..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -495,7 +495,8 @@ out:
 #define LPM_KEY_SIZE_MAX    LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN    LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)

-#define LPM_CREATE_FLAG_MASK    (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK    (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+                                 BPF_F_RDONLY | BPF_F_WRONLY)

 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a298d6666698..86ec846f2d5e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -40,6 +40,9 @@
 #include
 #include

+#define SOCK_CREATE_FLAG_MASK \
+    (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
     struct bpf_map map;
     struct sock **sock_map;
@@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)

     /* check sanity of attributes */
     if (attr->max_entries == 0 || attr->key_size != 4 ||
-        attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+        attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
         return ERR_PTR(-EINVAL);

     if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
 #include
 #include "percpu_freelist.h"

+#define STACK_CREATE_FLAG_MASK \
+    (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
     struct pcpu_freelist_node fnode;
     u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
     if (!capable(CAP_SYS_ADMIN))
         return ERR_PTR(-EPERM);

-    if (attr->map_flags & ~BPF_F_NUMA_NODE)
+    if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
         return ERR_PTR(-EINVAL);

     /* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0e893cac6795..676a06e6b322 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -34,6 +34,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))

+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif

+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+                              loff_t *ppos)
+{
+    /* We need this handler such that alloc_file() enables
+     * f_mode with FMODE_CAN_READ.
+     */
+    return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+                               size_t siz, loff_t *ppos)
+{
+    /* We need this handler such that alloc_file() enables
+     * f_mode with FMODE_CAN_WRITE.
+     */
+    return -EINVAL;
+}
+
 static const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
     .show_fdinfo = bpf_map_show_fdinfo,
 #endif
     .release     = bpf_map_release,
+    .read        = bpf_dummy_read,
+    .write       = bpf_dummy_write,
 };

-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
     return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-                            O_RDWR | O_CLOEXEC);
+                            flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+    if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+        return -EINVAL;
+    if (flags & BPF_F_RDONLY)
+        return O_RDONLY;
+    if (flags & BPF_F_WRONLY)
+        return O_WRONLY;
+    return O_RDWR;
 }

 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr)
 {
     int numa_node = bpf_map_attr_numa_node(attr);
     struct bpf_map *map;
+    int f_flags;
     int err;

     err = CHECK_ATTR(BPF_MAP_CREATE);
     if (err)
         return -EINVAL;

+    f_flags = bpf_get_file_flag(attr->map_flags);
+    if (f_flags < 0)
+        return f_flags;
+
     if (numa_node != NUMA_NO_NODE &&
         ((unsigned int)numa_node >= nr_node_ids ||
          !node_online(numa_node)))
@@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr)
     if (err)
         goto free_map;

-    err = bpf_map_new_fd(map);
+    err = bpf_map_new_fd(map, f_flags);
     if (err < 0) {
         /* failed to allocate fd.
          * bpf_map_put() is needed because the above
@@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr)
     if (IS_ERR(map))
         return PTR_ERR(map);

+    if (!(f.file->f_mode & FMODE_CAN_READ)) {
+        err = -EPERM;
+        goto err_put;
+    }
+
     key = memdup_user(ukey, map->key_size);
     if (IS_ERR(key)) {
         err = PTR_ERR(key);
@@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr)
     if (IS_ERR(map))
         return PTR_ERR(map);

+    if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+        err = -EPERM;
+        goto err_put;
+    }
+
     key = memdup_user(ukey, map->key_size);
     if (IS_ERR(key)) {
         err = PTR_ERR(key);
@@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr)
     if (IS_ERR(map))
         return PTR_ERR(map);

+    if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+        err = -EPERM;
+        goto err_put;
+    }
+
     key = memdup_user(ukey, map->key_size);
     if (IS_ERR(key)) {
         err = PTR_ERR(key);
@@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr)
     if (IS_ERR(map))
         return PTR_ERR(map);

+    if (!(f.file->f_mode & FMODE_CAN_READ)) {
+        err = -EPERM;
+        goto err_put;
+    }
+
     if (ukey) {
         key = memdup_user(ukey, map->key_size);
         if (IS_ERR(key)) {
@@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = {
     .show_fdinfo = bpf_prog_show_fdinfo,
 #endif
     .release     = bpf_prog_release,
+    .read        = bpf_dummy_read,
+    .write       = bpf_dummy_write,
 };

 int bpf_prog_new_fd(struct bpf_prog *prog)
@@ -1117,11 +1177,11 @@ free_prog_nouncharge:
     return err;
 }

-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags

 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-    if (CHECK_ATTR(BPF_OBJ))
+    if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
         return -EINVAL;

     return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)

 static int bpf_obj_get(const union bpf_attr *attr)
 {
-    if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+    if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+        attr->file_flags & ~BPF_OBJ_FLAG_MASK)
         return -EINVAL;

-    return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+    return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+                            attr->file_flags);
 }

 #ifdef CONFIG_CGROUP_BPF
@@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
     return fd;
 }

-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags

 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
     struct bpf_map *map;
     u32 id = attr->map_id;
+    int f_flags;
     int fd;

-    if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+    if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+        attr->open_flags & ~BPF_OBJ_FLAG_MASK)
         return -EINVAL;

     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;

+    f_flags = bpf_get_file_flag(attr->open_flags);
+    if (f_flags < 0)
+        return f_flags;
+
     spin_lock_bh(&map_idr_lock);
     map = idr_find(&map_idr, id);
     if (map)
@@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
     if (IS_ERR(map))
         return PTR_ERR(map);

-    fd = bpf_map_new_fd(map);
+    fd = bpf_map_new_fd(map, f_flags);
     if (fd < 0)
         bpf_map_put(map);

diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 29123934887b..041da0d9c06f 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
     int retval, fd;

     set_fs(KERNEL_DS);
-    fd = bpf_obj_get_user(path);
+    fd = bpf_obj_get_user(path, 0);
     set_fs(oldfs);
     if (fd < 0)
         return fd;
--
cgit v1.2.3