summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Kconfig2
-rw-r--r--kernel/bpf/arraymap.c21
-rw-r--r--kernel/bpf/bpf_iter.c22
-rw-r--r--kernel/bpf/btf.c82
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/cpumap.c116
-rw-r--r--kernel/bpf/devmap.c118
-rw-r--r--kernel/bpf/hashtab.c105
-rw-r--r--kernel/bpf/helpers.c356
-rw-r--r--kernel/bpf/local_storage.c20
-rw-r--r--kernel/bpf/map_in_map.c8
-rw-r--r--kernel/bpf/syscall.c53
-rw-r--r--kernel/bpf/trampoline.c12
-rw-r--r--kernel/bpf/verifier.c379
14 files changed, 1177 insertions, 119 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bd04f4a44c01..a82d6de86522 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -29,7 +29,7 @@ config BPF_SYSCALL
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
- select NET_SOCK_MSG if INET
+ select NET_SOCK_MSG if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..cebd4fb06d19 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
+static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+{
+ if (unlikely(map_value_has_timer(&arr->map)))
+ bpf_timer_cancel_and_free(val + arr->map.timer_off);
+}
+
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ check_and_free_timer_in_array(array, val);
}
return 0;
}
@@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (likely(!map_value_has_timer(map)))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_timer_cancel_and_free(array->value + array->elem_size * i +
+ map->timer_off);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2d4fbdbb194e..2e9d47bb40ff 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
return supported;
}
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
+}
+
static void bpf_iter_link_release(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cb4b72997d9b..c395024610ed 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
- if (!__btf_type_is_struct(t))
- return -EINVAL;
-
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
+ if (member_type->size != sz)
continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
+ /* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
return -EINVAL;
}
return off;
}
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+ const struct btf_var_secinfo *vsi;
+ u32 i, off = -ENOENT;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ if (!__btf_type_is_struct(var_type))
+ continue;
+ if (var_type->size != sz)
+ continue;
+ if (vsi->size != sz)
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ continue;
+ if (off != -ENOENT)
+ /* only one such field is allowed */
+ return -E2BIG;
+ off = vsi->offset;
+ if (off % align)
+ return -EINVAL;
+ }
+ return off;
+}
+
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, name, sz, align);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, name, sz, align);
+ return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_spin_lock",
+ sizeof(struct bpf_spin_lock),
+ __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_timer",
+ sizeof(struct bpf_timer),
+ __alignof__(struct bpf_timer));
+}
+
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
@@ -4776,6 +4825,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
info->reg_type = ctx_arg_info->reg_type;
info->btf = btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0a28a8095d3e..82af6279992d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1564,7 +1564,7 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 480e936c54d0..585b2b77ccc4 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -16,6 +16,7 @@
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
+#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ struct list_head *listp,
+ struct xdp_cpumap_stats *stats)
+{
+ struct sk_buff *skb, *tmp;
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ list_for_each_entry_safe(skb, tmp, listp, list) {
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_REDIRECT:
+ skb_list_del_init(skb);
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ return;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ stats->drop++;
+ return;
+ }
+ }
+}
+
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
@@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- if (!rcpu->prog)
- return n;
-
- rcu_read_lock_bh();
-
xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
@@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
+ xdp_clear_return_frame_no_direct();
+
+ return nframes;
+}
+
+#define CPUMAP_BATCH 8
+
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ int xdp_n, struct xdp_cpumap_stats *stats,
+ struct list_head *list)
+{
+ int nframes;
+
+ if (!rcpu->prog)
+ return xdp_n;
+
+ rcu_read_lock_bh();
+
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+
if (stats->redirect)
- xdp_do_flush_map();
+ xdp_do_flush();
- xdp_clear_return_frame_no_direct();
+ if (unlikely(!list_empty(list)))
+ cpu_map_bpf_prog_run_skb(rcpu, list, stats);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
}
-#define CPUMAP_BATCH 8
static int cpu_map_kthread_run(void *data)
{
@@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data)
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- int i, n, m, nframes;
LIST_HEAD(list);
/* Release CPU reschedule checks */
@@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0; i < n; i++) {
+ for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
- struct page *page = virt_to_page(f);
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ list_add_tail(&skb->list, &list);
+ continue;
+ }
+
+ frames[xdp_n++] = f;
+ page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data)
}
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
@@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-bool cpu_map_prog_allowed(struct bpf_map *map)
-{
- return map->map_type == BPF_MAP_TYPE_CPUMAP &&
- map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
-}
-
static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
{
struct bpf_prog *prog;
@@ -701,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
+
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fdc20892837c..f02d04540c0c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -322,16 +322,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-bool dev_map_can_have_prog(struct bpf_map *map)
-{
- if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
- map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
- return true;
-
- return false;
-}
-
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
struct net_device *dev)
@@ -499,6 +489,37 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
return 0;
}
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
+{
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
+ u32 act;
+
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
+ switch (act) {
+ case XDP_PASS:
+ __skb_push(skb, skb->mac_len);
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
+ break;
+ }
+
+ return act;
+}
+
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
@@ -513,10 +534,9 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
}
-static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
- int exclude_ifindex)
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
{
- if (!obj || obj->dev->ifindex == exclude_ifindex ||
+ if (!obj ||
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
@@ -541,17 +561,48 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
return 0;
}
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
+ }
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
struct xdp_frame *xdpf;
+ int num_excluded = 0;
unsigned int i;
int err;
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
+
xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
@@ -560,7 +611,10 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ if (!is_valid_dst(dst, xdp))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -580,7 +634,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
- if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ if (!is_valid_dst(dst, xdp))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -615,6 +673,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);
@@ -646,18 +712,27 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
struct hlist_node *next;
+ int num_excluded = 0;
unsigned int i;
int err;
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!dst || dst->dev->ifindex == exclude_ifindex)
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
@@ -671,12 +746,17 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
return err;
last_dst = dst;
+
}
} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_safe(dst, next, head, index_hlist) {
- if (!dst || dst->dev->ifindex == exclude_ifindex)
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
continue;
/* we only need n-1 clones; last_dst enqueued below */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9c011f3a2687..32471ba02708 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+ cond_resched();
+ }
+}
+
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ u32 key_size = htab->map.key_size;
+
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, htab->map.key_size);
+ memcpy(l->key, key, key_size);
+ check_and_init_map_value(&htab->map,
+ l->key + round_up(key_size, 8));
return l;
}
@@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (unlikely(map_value_has_timer(&htab->map)))
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_timer(htab, l);
break;
}
@@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ check_and_free_timer(htab, l);
kfree(l);
}
@@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ check_and_free_timer(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
+ check_and_init_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_timer(htab, l_old);
}
ret = 0;
err:
@@ -1069,6 +1112,12 @@ err:
return ret;
}
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_timer(htab, elem);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
@@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+ copy_map_value(&htab->map,
+ l_new->key + round_up(map->key_size, 8), value);
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
@@ -1128,9 +1178,9 @@ err:
htab_unlock_bucket(htab, b, hash, flags);
if (ret)
- bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ htab_lru_push_free(htab, l_new);
else if (l_old)
- bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+ htab_lru_push_free(htab, l_old);
return ret;
}
@@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
htab_unlock_bucket(htab, b, hash, flags);
if (l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry(l, n, head, hash_node)
+ check_and_free_timer(htab, l);
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers(htab);
+ else
+ htab_free_prealloced_timers(htab);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
- check_and_init_map_lock(map, value);
+ check_and_init_map_value(map, value);
}
hlist_nulls_del_rcu(&l->hash_node);
@@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
htab_unlock_bucket(htab, b, hash, bflags);
if (is_lru_map && l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1645,7 +1724,7 @@ again_nocopy:
true);
else
copy_map_value(map, dst_val, value);
- check_and_init_map_lock(map, dst_val);
+ check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@@ -1672,7 +1751,7 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
}
next_batch:
@@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 55f83ea09dae..04010a5e2ddb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
return 0;
}
@@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
};
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
return 0;
}
@@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
else
lock = dst + map->spin_lock_off;
preempt_disable();
- ____bpf_spin_lock(lock);
+ __bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
- ____bpf_spin_unlock(lock);
+ __bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@@ -393,8 +403,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -403,17 +411,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage = NULL;
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
void *ptr;
- int i;
- for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
- if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
- continue;
-
- storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
- break;
- }
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
@@ -999,6 +1003,320 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct hrtimer timer;
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space resereved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+ */
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ void *callback_fn;
+ void *key;
+ u32 idx;
+
+ callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
+ (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->value = (void *)timer - map->timer_off;
+ t->map = map;
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ timer->timer = t;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&t->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = t->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ t->prog = prog;
+ }
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+ struct bpf_prog *prog = t->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ }
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (this_cpu_read(hrtimer_running) == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+ drop_prog_refcnt(t);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_timer_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+ if (!READ_ONCE(timer->timer))
+ return;
+
+ __bpf_spin_lock_irqsave(&timer->lock);
+ /* re-read it under lock */
+ t = timer->timer;
+ if (!t)
+ goto out;
+ drop_prog_refcnt(t);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ timer->timer = NULL;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+ * right after for both preallocated and non-preallocated maps.
+ * The timer->timer = NULL was already done and no code path can
+ * see address 't' anymore.
+ *
+ * Check that bpf_map_delete/update_elem() wasn't called from timer
+ * callback_fn. In such case don't call hrtimer_cancel() (since it will
+ * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+ * return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+ kfree(t);
+}
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1065,6 +1383,14 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
default:
break;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..035e9e3a7132 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,6 +1,7 @@
//SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
@@ -11,9 +12,6 @@
#ifdef CONFIG_CGROUP_BPF
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
- bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
@@ -173,7 +171,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
- check_and_init_map_lock(map, new->data);
+ check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -286,9 +284,17 @@ enoent:
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
+ __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
+ /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu
+ * is the same as other local storages.
+ */
+ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ max_value_size = min_t(__u32, max_value_size,
+ PCPU_MIN_UNIT_SIZE);
+
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
attr->key_size != sizeof(__u64))
return ERR_PTR(-EINVAL);
@@ -296,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->value_size == 0)
return ERR_PTR(-EINVAL);
- if (attr->value_size > PAGE_SIZE)
+ if (attr->value_size > max_value_size)
return ERR_PTR(-E2BIG);
if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -409,7 +415,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ enum bpf_cgroup_storage_type stype;
struct bpf_cgroup_storage *storage;
int cpu;
@@ -509,7 +515,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
map->numa_node);
if (!storage->buf)
goto enomem;
- check_and_init_map_lock(map, storage->buf->data);
+ check_and_init_map_value(map, storage->buf->data);
} else {
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade..5cd8f5277279 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include "map_in_map.h"
@@ -50,6 +51,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+ inner_map_meta->timer_off = inner_map->timer_off;
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
@@ -65,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -75,6 +82,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
+ meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e343f158e556..9a2068e39d23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
@@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
+ map_value_has_timer(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
}
+ map->timer_off = btf_find_timer(btf, value_type);
+ if (map_value_has_timer(map)) {
+ if (map->map_flags & BPF_F_RDONLY_PROG)
+ return -EACCES;
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ return -EOPNOTSUPP;
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
+ map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+ map_value_has_timer(map)) {
fdput(f);
return -ENOTSUPP;
}
@@ -1699,6 +1712,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
+ unsigned long flags;
+
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@@ -1708,7 +1723,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
return;
if (do_idr_lock)
- spin_lock_bh(&prog_idr_lock);
+ spin_lock_irqsave(&prog_idr_lock, flags);
else
__acquire(&prog_idr_lock);
@@ -1716,7 +1731,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
prog->aux->id = 0;
if (do_idr_lock)
- spin_unlock_bh(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
else
__release(&prog_idr_lock);
}
@@ -1752,14 +1767,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
}
}
+static void bpf_prog_put_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ __bpf_prog_put_noref(prog, true);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
- if (atomic64_dec_and_test(&prog->aux->refcnt)) {
- perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
- bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- __bpf_prog_put_noref(prog, true);
+
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
}
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 28a3630c48ee..b2535acfe9db 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
static struct bpf_tramp_progs *
-bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
const struct bpf_prog_aux *aux;
struct bpf_tramp_progs *tprogs;
@@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
*total += tr->progs_cnt[kind];
progs = tprogs[kind].progs;
- hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
+ hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= aux->prog->call_get_func_ip;
*progs++ = aux->prog;
+ }
}
return tprogs;
}
@@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ bool ip_arg = false;
int err, total;
- tprogs = bpf_trampoline_get_progs(tr, &total);
+ tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
if (IS_ERR(tprogs))
return PTR_ERR(tprogs);
@@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ if (ip_arg)
+ flags |= BPF_TRAMP_F_IP_ARG;
+
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 49f07e2bf23b..9134aedfdb7d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -734,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->refs[i].id)
verbose(env, ",%d", state->refs[i].id);
}
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
verbose(env, "\n");
}
@@ -1135,6 +1140,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1522,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env,
init_reg_state(env, state);
}
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog)
+{
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.len_used;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ goto err;
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ if (!frame)
+ goto err;
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
+err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ /* pop all elements and return */
+ while (!pop_stack(env, NULL, NULL, false));
+ return NULL;
+}
+
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -3217,6 +3274,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
}
+ if (map_value_has_timer(map)) {
+ u32 t = map->timer_off;
+
+ if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+ t < reg->umax_value + off + size) {
+ verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -3619,6 +3685,8 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
+ int next_insn;
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -3626,13 +3694,22 @@ continue_func:
ret_prog[frame] = idx;
/* find the callee */
- i = i + insn[i].imm + 1;
- idx = find_subprog(env, i);
+ next_insn = i + insn[i].imm + 1;
+ idx = find_subprog(env, next_insn);
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i);
+ next_insn);
return -EFAULT;
}
+ if (subprog[idx].is_async_cb) {
+ if (subprog[idx].has_tail_call) {
+ verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size */
+ continue;
+ }
+ i = next_insn;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -4634,6 +4711,54 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_timer(map)) {
+ if (map->timer_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_timer'\n",
+ map->name);
+ else if (map->timer_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_timer'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_timer is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->timer_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+ val + reg->off, map->timer_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -4766,6 +4891,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -4797,6 +4923,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4926,7 +5053,29 @@ skip_type_check:
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
meta->map_ptr = reg->map_ptr;
+ meta->map_uid = reg->map_uid;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
@@ -4978,6 +5127,9 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_TIMER) {
+ if (process_timer_func(env, regno, meta))
+ return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5597,6 +5749,31 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+ struct bpf_verifier_state *async_cb;
+
+ /* there is no real recursion here. timer callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ *insn_idx, subprog);
+ if (!async_cb)
+ return -EFAULT;
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* continue with next insn after call */
+ return 0;
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -5724,6 +5901,35 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5937,6 +6143,29 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
return err;
}
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ int func_id = BPF_FUNC_get_func_ip;
+
+ if (type == BPF_PROG_TYPE_TRACING) {
+ if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
+ eatype != BPF_MODIFY_RETURN) {
+ verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ func_id_name(func_id), func_id);
+ return -ENOTSUPP;
+ }
+ return 0;
+ } else if (type == BPF_PROG_TYPE_KPROBE) {
+ return 0;
+ }
+
+ verbose(env, "func %s#%d not supported for program type %d\n",
+ func_id_name(func_id), func_id, type);
+ return -ENOTSUPP;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -6051,6 +6280,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (func_id == BPF_FUNC_timer_set_callback) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_timer_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
if (func_id == BPF_FUNC_snprintf) {
err = check_bpf_snprintf_call(env, regs);
if (err < 0)
@@ -6086,6 +6322,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
if (map_value_has_spin_lock(meta.map_ptr))
@@ -6207,6 +6444,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
env->prog->call_get_stack = true;
+ if (func_id == BPF_FUNC_get_func_ip) {
+ if (check_get_func_ip(env))
+ return -ENOTSUPP;
+ env->prog->call_get_func_ip = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -9087,7 +9330,8 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
- const bool is_subprog = env->cur_state->frame[0]->subprogno;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog &&
@@ -9112,6 +9356,22 @@ static int check_return_code(struct bpf_verifier_env *env)
}
reg = cur_regs(env) + BPF_REG_0;
+
+ if (frame->in_async_callback_fn) {
+ /* enforce return zero from async callbacks like timer */
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(tnum_const(0), reg->var_off)) {
+ verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
@@ -9326,8 +9586,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
init_explored_state(env, t + 1);
if (visit_callee) {
init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+ * check.
+ */
+ bpf_pseudo_func(insns + t));
}
return ret;
}
@@ -9355,6 +9619,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
+ if (insns[t].imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn to trigger is_state_visited() check
+ * before call itself is processed by __check_func_call().
+ * Otherwise new async state will be pushed for further
+ * exploration.
+ */
+ init_explored_state(env, t);
return visit_func_call_insn(t, insn_cnt, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
@@ -10363,9 +10634,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
+
if (sl->state.branches) {
- if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ } else if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -11414,10 +11701,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_insn_aux_data *new_data,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
u32 old_seen = old_data[off].seen;
u32 prog_len;
@@ -11430,12 +11718,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
- return 0;
+ return;
prog_len = new_prog->len;
- new_data = vzalloc(array_size(prog_len,
- sizeof(struct bpf_insn_aux_data)));
- if (!new_data)
- return -ENOMEM;
+
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
@@ -11446,7 +11731,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
}
env->insn_aux_data = new_data;
vfree(old_data);
- return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -11481,6 +11765,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)));
+ if (!new_data)
+ return NULL;
+ }
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@@ -11488,10 +11780,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
+ vfree(new_data);
return NULL;
}
- if (adjust_insn_aux_data(env, new_prog, off, len))
- return NULL;
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@@ -12342,6 +12634,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@@ -12559,6 +12852,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -12675,6 +13001,21 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed