summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c8
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c20
-rw-r--r--kernel/bpf/cpumap.c702
-rw-r--r--kernel/bpf/devmap.c5
-rw-r--r--kernel/bpf/disasm.c214
-rw-r--r--kernel/bpf/disasm.h32
-rw-r--r--kernel/bpf/hashtab.c5
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/sockmap.c5
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c145
-rw-r--r--kernel/bpf/verifier.c751
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/events/core.c15
-rw-r--r--kernel/smpboot.c25
-rw-r--r--kernel/sysctl.c22
-rw-r--r--kernel/trace/bpf_trace.c88
-rw-r--r--kernel/watchdog.c643
-rw-r--r--kernel/watchdog_hld.c196
22 files changed, 1900 insertions, 1010 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,8 +2,10 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
ifeq ($(CONFIG_STREAM_PARSER),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..988c04c91e10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
#include "map_in_map.h"
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
@@ -492,7 +496,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e88abc0865d5..3db5a17fcfe8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
struct cgroup_subsys_state *css;
struct bpf_prog_list *pl;
bool pl_was_allocated;
- u32 old_flags;
int err;
if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
@@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = prog;
}
- old_flags = cgrp->bpf.flags[type];
cgrp->bpf.flags[type] = flags;
/* allocate and recompute effective prog arrays */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c6be15ae83ee..8e7c8bf2b687 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
+ const char *end = sym + KSYM_NAME_LEN;
+
BUILD_BUG_ON(sizeof("bpf_prog_") +
- sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
- *sym = 0;
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
}
static __always_inline unsigned long
@@ -1567,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..b4358d84ddf1
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,702 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+ u32 qsize; /* Queue size placeholder for map lookup */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+ struct work_struct kthread_stop_wq;
+
+ atomic_t refcnt; /* Control when this struct can be free'ed */
+ struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry **cpu_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_cpu_map *cmap;
+ int err = -ENOMEM;
+ u64 cost;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ cmap->map.map_type = attr->map_type;
+ cmap->map.key_size = attr->key_size;
+ cmap->map.value_size = attr->value_size;
+ cmap->map.max_entries = attr->max_entries;
+ cmap->map.map_flags = attr->map_flags;
+ cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (cmap->map.max_entries > NR_CPUS) {
+ err = -E2BIG;
+ goto free_cmap;
+ }
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+ cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_cmap;
+ cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ ret = bpf_map_precharge_memlock(cmap->map.pages);
+ if (ret) {
+ err = ret;
+ goto free_cmap;
+ }
+
+ /* A per cpu bitfield with a bit per possible CPU in map */
+ cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!cmap->flush_needed)
+ goto free_cmap;
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map)
+ goto free_percpu;
+
+ return &cmap->map;
+free_percpu:
+ free_percpu(cmap->flush_needed);
+free_cmap:
+ kfree(cmap);
+ return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ if (WARN_ON_ONCE(ptr))
+ page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+ /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+ * as it waits until all in-flight call_rcu() callbacks complete.
+ */
+ rcu_barrier();
+
+ /* kthread_stop will wake_up_process and wait for it to complete */
+ kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+ struct xdp_pkt *xdp_pkt;
+ int metasize;
+ int headroom;
+
+ /* Assure headroom is available for storing info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if ((headroom - metasize) < sizeof(*xdp_pkt))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_pkt = xdp->data_hard_start;
+
+ xdp_pkt->data = xdp->data;
+ xdp_pkt->len = xdp->data_end - xdp->data;
+ xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+ xdp_pkt->metasize = metasize;
+
+ return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
+{
+ unsigned int frame_size;
+ void *pkt_data_start;
+ struct sk_buff *skb;
+
+ /* build_skb need to place skb_shared_info after SKB end, and
+ * also want to know the memory "truesize". Thus, need to
+ * know the memory frame size backing xdp_buff.
+ *
+ * XDP was designed to have PAGE_SIZE frames, but this
+ * assumption is not longer true with ixgbe and i40e. It
+ * would be preferred to set frame_size to 2048 or 4096
+ * depending on the driver.
+ * frame_size = 2048;
+ * frame_len = frame_size - sizeof(*xdp_pkt);
+ *
+ * Instead, with info avail, skb_shared_info in placed after
+ * packet len. This, unfortunately fakes the truesize.
+ * Another disadvantage of this approach, the skb_shared_info
+ * is not at a fixed memory location, with mixed length
+ * packets, which is bad for cache-line hotness.
+ */
+ frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ skb = build_skb(pkt_data_start, frame_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, xdp_pkt->headroom);
+ __skb_put(skb, xdp_pkt->len);
+ if (xdp_pkt->metasize)
+ skb_metadata_set(skb, xdp_pkt->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When kthread gives stop order, then rcpu have been disconnected
+ * from map, thus no new packets can enter. Remaining in-flight
+ * per CPU stored packets are flushed to this queue. Wait honoring
+ * kthread_stop signal until queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ unsigned int processed = 0, drops = 0, sched = 0;
+ struct xdp_pkt *xdp_pkt;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ sched = 1;
+ } else {
+ sched = cond_resched();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ /* Process packets in rcpu->queue */
+ local_bh_disable();
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread CPU pinned. Lockless access to ptr_ring
+ * consume side valid as no-resize allowed of queue.
+ */
+ while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ struct sk_buff *skb;
+ int ret;
+
+ skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ if (!skb) {
+ page_frag_free(xdp_pkt);
+ continue;
+ }
+
+ /* Inject into network stack */
+ ret = netif_receive_skb_core(skb);
+ if (ret == NET_RX_DROP)
+ drops++;
+
+ /* Limit BH-disable period */
+ if (++processed == 8)
+ break;
+ }
+ /* Feedback loop via tracepoint */
+ trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ put_cpu_map_entry(rcpu);
+ return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+ gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ int numa, err;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ /* Alloc queue */
+ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map_id;
+ rcpu->qsize = qsize;
+
+ /* Setup kthread */
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu, map_id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_ptr_ring;
+
+ get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ return rcpu;
+
+free_ptr_ring:
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_cpu_map_entry *rcpu;
+ int cpu;
+
+ /* This cpu_map_entry have been disconnected from map and one
+ * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+ /* Flush remaining packets in percpu bulkq */
+ for_each_online_cpu(cpu) {
+ struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+ /* No concurrent bq_enqueue can run at this point */
+ bq_flush_to_queue(rcpu, bq);
+ }
+ free_percpu(rcpu->bulkq);
+ /* Cannot kthread_stop() here, last put free rcpu resources */
+ put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue. Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ if (old_rcpu) {
+ call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+ INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+ schedule_work(&old_rcpu->kthread_stop_wq);
+ }
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() use preempt_disable() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+ /* Value is the queue size */
+ u32 qsize = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (!cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ int cpu;
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+ * It does __not__ ensure pending flush operations (if any) are
+ * complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ cond_resched();
+ }
+
+ /* For cpu_map the remote CPUs can still be using the entries
+ * (struct bpf_cpu_map_entry).
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = READ_ONCE(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* bq flush and cleanup happens after RCU graze-period */
+ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ }
+ free_percpu(cmap->flush_needed);
+ bpf_map_area_free(cmap->cpu_map);
+ kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = READ_ONCE(cmap->cpu_map[key]);
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ void *xdp_pkt = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdp_pkt);
+ if (err) {
+ drops++;
+ page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+ return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(rcpu, bq);
+
+ /* Notice, xdp_buff/page MUST be queued here, long enough for
+ * driver to code invoking us to finished, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, incoming xdp_pkt is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as driver will invoke flush
+ * operation, when completing napi->poll call.
+ */
+ bq->q[bq->count++] = xdp_pkt;
+ return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct xdp_pkt *xdp_pkt;
+
+ xdp_pkt = convert_to_xdp_pkt(xdp);
+ if (!xdp_pkt)
+ return -EOVERFLOW;
+
+ /* Info needed when constructing SKB on remote CPU */
+ xdp_pkt->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdp_pkt);
+ return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+ u32 bit;
+
+ /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+ * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+ * bitmap indicate which percpu bulkq have packets.
+ */
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+ struct xdp_bulk_queue *bq;
+
+ /* This is possible if entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!rcpu))
+ continue;
+
+ __clear_bit(bit, bitmap);
+
+ /* Flush all frames in bulkq to real queue */
+ bq = this_cpu_ptr(rcpu->bulkq);
+ bq_flush_to_queue(rcpu, bq);
+
+ /* If already running, costs spin_lock_irqsave + smb_mb */
+ wake_up_process(rcpu->kthread);
+ }
+}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e5d3de7cff2e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
#include <linux/bpf.h>
#include <linux/filter.h>
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_dtab_netdev {
struct net_device *dev;
struct bpf_dtab *dtab;
@@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(verbose, env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose(env, "BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose(env, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(env, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(env, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+ const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..919955236e63 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
-#define HTAB_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
struct bucket {
struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -326,18 +326,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -363,6 +368,7 @@ out:
putname(pname);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 34d8a690ea05..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -495,7 +495,8 @@ out:
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
-#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a298d6666698..86ec846f2d5e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -40,6 +40,9 @@
#include <linux/list.h>
#include <net/strparser.h>
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
@@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~BPF_F_NUMA_NODE)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0048cb24ba7b..323be2473c4b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -34,6 +34,8 @@
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
@@ -210,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -294,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -322,6 +362,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
{
const char *end = src + BPF_OBJ_NAME_LEN;
+ memset(dst, 0, BPF_OBJ_NAME_LEN);
+
/* Copy all isalnum() and '_' char */
while (src < end && *src) {
if (!isalnum(*src) && *src != '_')
@@ -333,9 +375,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
if (src == end)
return -EINVAL;
- /* '\0' terminates dst */
- *dst = 0;
-
return 0;
}
@@ -345,12 +384,17 @@ static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
@@ -368,15 +412,19 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
@@ -393,6 +441,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -491,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -571,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -593,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
+ /* Need to create a kthread, thus must support schedule */
+ if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ goto out;
+ }
+
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
@@ -623,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
@@ -654,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -697,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -734,9 +810,9 @@ err_put:
return err;
}
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
- [_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
@@ -851,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -898,15 +975,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -1046,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -1107,16 +1196,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1124,10 +1215,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
}
#ifdef CONFIG_CGROUP_BPF
@@ -1387,20 +1480,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
+ int f_flags;
int fd;
- if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
@@ -1412,7 +1511,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- fd = bpf_map_new_fd(map);
+ fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put(map);
@@ -1567,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52b022310f6a..545b8c45a578 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +164,36 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
+ unsigned int n;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
va_end(args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
+
+ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+ log->len_used += n;
+ else
+ log->ubuf = NULL;
}
static bool type_is_pkt_pointer(enum bpf_reg_type type)
@@ -197,23 +216,8 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
- __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
- BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
- if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
- return func_id_str[id];
- else
- return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -224,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
- verbose("%lld", reg->var_off.value + reg->off);
+ verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- verbose("(id=%d", reg->id);
+ verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
- verbose(",off=%d", reg->off);
+ verbose(env, ",off=%d", reg->off);
if (type_is_pkt_pointer(t))
- verbose(",r=%d", reg->range);
+ verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(",ks=%d,vs=%d",
+ verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
@@ -246,218 +250,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(",imm=%llx", reg->var_off.value);
+ verbose(env, ",imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(",smin_value=%lld",
+ verbose(env, ",smin_value=%lld",
(long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(",smax_value=%lld",
+ verbose(env, ",smax_value=%lld",
(long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(",umin_value=%llu",
+ verbose(env, ",umin_value=%llu",
(unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(",umax_value=%llu",
+ verbose(env, ",umax_value=%llu",
(unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(",var_off=%s", tn_buf);
+ verbose(env, ",var_off=%s", tn_buf);
}
}
- verbose(")");
+ verbose(env, ")");
}
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
}
- verbose("\n");
-}
-
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JLT >> 4] = "<",
- [BPF_JGE >> 4] = ">=",
- [BPF_JLE >> 4] = "<=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSLT >> 4] = "s<",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_JSLE >> 4] = "s<=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
-{
- verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
- BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
- insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
-{
- u8 class = BPF_CLASS(insn->code);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_OP(insn->code) == BPF_END) {
- if (class == BPF_ALU64)
- verbose("BUG_alu64_%02x\n", insn->code);
- else
- print_bpf_end_insn(env, insn);
- } else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose("(%02x) r%d = %s-r%d\n",
- insn->code, insn->dst_reg,
- class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- } else {
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- }
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
- else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM &&
- BPF_SIZE(insn->code) == BPF_DW) {
- /* At this point, we already made sure that the second
- * part of the ldimm64 insn is accessible.
- */
- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
- if (map_ptr && !env->allow_ptr_leaks)
- imm = 0;
-
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
- } else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
- }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
-
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
- } else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
- }
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
- }
+ verbose(env, "\n");
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
@@ -495,7 +319,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
@@ -533,10 +357,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -646,10 +471,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
__mark_reg_unbounded(reg);
}
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_unknown(regs, %u)\n", regno);
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -664,10 +490,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_not_init(regs, %u)\n", regno);
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -676,22 +503,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
__mark_reg_not_init(regs + regno);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(regs, i);
+ mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
}
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
- mark_reg_known_zero(regs, BPF_REG_FP);
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(regs, BPF_REG_1);
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -704,6 +532,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
{
struct bpf_verifier_state *parent = state->parent;
+ if (regno == BPF_REG_FP)
+ /* We don't need to worry about FP liveness because it's read-only */
+ return;
+
while (parent) {
/* if read wasn't screened by an earlier write ... */
if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -721,26 +553,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *regs = env->cur_state.regs;
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
@@ -765,7 +597,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -778,7 +611,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
@@ -813,7 +646,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
}
}
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
@@ -823,12 +657,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
if (slot_type[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
@@ -844,14 +678,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
} else {
for (i = 0; i < size; i++) {
if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
@@ -863,7 +697,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
@@ -882,8 +716,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
@@ -891,13 +725,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* will have a set floor within our range.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n", regno);
+ verbose(env, "R%d min value is outside of the array range\n",
+ regno);
return err;
}
@@ -906,13 +741,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
err = __check_map_access(env, regno, reg->umax_value + off, size);
if (err)
- verbose("R%d max value is outside of the array range\n", regno);
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
return err;
}
@@ -951,7 +787,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *reg = &regs[regno];
if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
@@ -974,13 +810,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
* detail to prove they're safe.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
err = __check_packet_access(env, regno, off, size);
if (err) {
- verbose("R%d offset is outside of the packet\n", regno);
+ verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
return err;
@@ -994,12 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
.reg_type = *reg_type,
};
- /* for analyzer ctx accesses are already validated and converted */
- if (env->analyzer_ops)
- return 0;
-
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -1007,16 +839,19 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
+ if (env->analyzer_ops)
+ return 0;
+
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -1034,7 +869,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
struct tnum reg_off;
@@ -1059,7 +895,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1067,7 +904,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
const char *pointer_desc,
int off, int size, bool strict)
{
@@ -1082,7 +920,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1103,7 +941,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
/* Special case, because of NET_IP_ALIGN. Given metadata sits
* right in front, treat it the very same way.
*/
- return check_pkt_ptr_alignment(reg, off, size, strict);
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1116,7 +954,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
default:
break;
}
- return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1148,20 +987,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
/* ctx accesses must be at a fixed offset, so that we can
@@ -1171,7 +1010,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable ctx access var_off=%s off=%d size=%d",
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
@@ -1183,9 +1023,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
+ mark_reg_known_zero(env, state->regs,
+ value_regno);
state->regs[value_regno].id = 0;
state->regs[value_regno].off = 0;
state->regs[value_regno].range = 0;
@@ -1201,13 +1042,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
@@ -1218,29 +1060,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
}
- err = check_stack_write(state, off, size, value_regno);
+ err = check_stack_write(env, state, off, size,
+ value_regno);
} else {
- err = check_stack_read(state, off, size, value_regno);
+ err = check_stack_read(env, state, off, size,
+ value_regno);
}
} else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
@@ -1260,7 +1105,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
@@ -1275,7 +1120,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
}
@@ -1316,7 +1161,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
@@ -1327,13 +1172,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
- verbose("invalid variable stack read R%d var_off=%s\n",
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1349,7 +1194,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
for (i = 0; i < access_size; i++) {
if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1392,7 +1237,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
@@ -1400,7 +1246,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
@@ -1438,7 +1284,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1456,7 +1302,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
if (type_is_pkt_pointer(type))
@@ -1472,7 +1318,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
if (type_is_pkt_pointer(type))
@@ -1492,7 +1338,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
@@ -1509,7 +1356,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta = NULL;
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno);
return -EACCES;
}
@@ -1523,7 +1370,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
regno);
return -EACCES;
}
@@ -1534,12 +1381,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1552,7 +1400,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1572,6 +1421,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ /* Restrict bpf side of cpumap, open when use-cases appear */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1595,6 +1449,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -1608,7 +1463,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_FUNC_redirect_map:
- if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
@@ -1625,7 +1481,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1659,7 +1515,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(regs, i);
+ mark_reg_unknown(env, regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1681,21 +1537,23 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
+ if (env->ops->get_func_proto)
+ fn = env->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -1709,7 +1567,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1742,14 +1600,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1757,14 +1615,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
- mark_reg_known_zero(regs, BPF_REG_0);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1775,12 +1634,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1839,39 +1698,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad sbounds\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad sbounds\n");
return -EINVAL;
}
if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad ubounds\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad ubounds\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
if (!env->allow_ptr_leaks)
- verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
dst);
return -EACCES;
}
@@ -1936,7 +1798,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
if (!env->allow_ptr_leaks)
- verbose("R%d tried to subtract pointer from scalar\n",
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
dst);
return -EACCES;
}
@@ -1946,7 +1808,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
*/
if (ptr_reg->type == PTR_TO_STACK) {
if (!env->allow_ptr_leaks)
- verbose("R%d subtraction from stack pointer prohibited\n",
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
dst);
return -EACCES;
}
@@ -2001,13 +1863,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* ptr &= ~3 which would reduce min_value by 3.)
*/
if (!env->allow_ptr_leaks)
- verbose("R%d bitwise operator %s on pointer prohibited\n",
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
@@ -2173,7 +2035,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* We lose all sign bit information (except what we can pick
@@ -2201,7 +2063,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2229,7 +2091,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
default:
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -2261,12 +2123,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar.
*/
if (!env->allow_ptr_leaks) {
- verbose("R%d pointer %s pointer prohibited\n",
+ verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
} else {
/* scalar += pointer
@@ -2318,13 +2180,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: unexpected ptr_reg\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: no src_reg\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2342,14 +2204,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
BPF_CLASS(insn->code) == BPF_ALU64) {
- verbose("BPF_END uses reserved fields\n");
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
@@ -2360,7 +2222,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
@@ -2374,7 +2236,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -2384,7 +2246,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
@@ -2400,14 +2262,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* copy register state to dest reg
*/
regs[insn->dst_reg] = regs[insn->src_reg];
+ regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
/* high 32 bits are known zero. */
regs[insn->dst_reg].var_off = tnum_cast(
regs[insn->dst_reg].var_off, 4);
@@ -2422,14 +2286,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
@@ -2438,7 +2302,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
@@ -2450,7 +2314,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
@@ -2459,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
@@ -2812,13 +2676,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
int err;
if (opcode > BPF_JSLE) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
@@ -2828,13 +2692,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
@@ -2946,11 +2810,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET_META);
} else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2969,11 +2834,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
@@ -3031,14 +2896,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -3048,7 +2913,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
@@ -3061,7 +2927,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -3069,7 +2935,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* the value fetched from the packet.
* Already marked as written above.
*/
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
return 0;
}
@@ -3089,22 +2955,22 @@ static int check_return_code(struct bpf_verifier_env *env)
reg = &env->cur_state.regs[BPF_REG_0];
if (reg->type != SCALAR_VALUE) {
- verbose("At program exit the register R0 is not a known value (%s)\n",
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n",
reg_type_str[reg->type]);
return -EINVAL;
}
if (!tnum_in(range, reg->var_off)) {
- verbose("At program exit the register R0 ");
+ verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("has value %s", tn_buf);
+ verbose(env, "has value %s", tn_buf);
} else {
- verbose("has unknown scalar value");
+ verbose(env, "has unknown scalar value");
}
- verbose(" should have been 0 or 1\n");
+ verbose(env, " should have been 0 or 1\n");
return -EINVAL;
}
return 0;
@@ -3170,7 +3036,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -3187,13 +3053,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -3287,7 +3153,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -3296,7 +3162,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -3677,7 +3543,7 @@ static int do_check(struct bpf_verifier_env *env)
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ init_reg_state(env, regs);
state->parent = NULL;
insn_idx = 0;
for (;;) {
@@ -3686,7 +3552,7 @@ static int do_check(struct bpf_verifier_env *env)
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3695,7 +3561,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3705,12 +3572,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3718,19 +3585,20 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, &env->cur_state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(env, insn);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
+ print_bpf_insn(verbose, env, insn,
+ env->allow_ptr_leaks);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -3786,7 +3654,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3826,14 +3694,14 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
@@ -3856,7 +3724,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3869,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3881,7 +3749,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3896,7 +3764,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
@@ -3931,19 +3799,19 @@ process_bpf_exit:
insn_idx++;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3955,7 +3823,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3966,12 +3835,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3994,14 +3863,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -4012,7 +3881,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -4021,19 +3890,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -4142,7 +4012,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const struct bpf_verifier_ops *ops = env->ops;
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
@@ -4155,7 +4025,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4203,7 +4073,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose("bpf verifier narrow ctx access misconfigured\n");
+ verbose(env, "bpf verifier narrow ctx access misconfigured\n");
return -EINVAL;
}
@@ -4222,7 +4092,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
(ctx_field_size && !target_size)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4304,7 +4174,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4343,12 +4213,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = prog->aux->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -4382,8 +4253,8 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
/* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4392,6 +4263,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -4399,6 +4271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->insn_aux_data)
goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -4407,23 +4280,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
- goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4460,17 +4325,11 @@ skip_full_check:
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
+ if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
- /* fall through to return what was recorded */
- }
-
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && !log->ubuf) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
if (ret == 0 && env->used_map_cnt) {
@@ -4481,7 +4340,7 @@ skip_full_check:
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4494,9 +4353,7 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
+err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -4511,12 +4368,21 @@ err_free_env:
return ret;
}
+static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+ [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops,
+ [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops,
+};
+
int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
void *priv)
{
struct bpf_verifier_env *env;
int ret;
+ if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) ||
+ !bpf_analyzer_ops[prog->type])
+ return -EOPNOTSUPP;
+
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
@@ -4527,14 +4393,13 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
if (!env->insn_aux_data)
goto err_free_env;
env->prog = prog;
+ env->ops = bpf_analyzer_ops[env->prog->type];
env->analyzer_ops = ops;
env->analyzer_priv = priv;
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
- log_level = 0;
-
env->strict_alignment = false;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8de11a29e495..d851df22f5c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
@@ -897,6 +898,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
out:
cpus_write_unlock();
+ /*
+ * Do post unplug cleanup. This is still protected against
+ * concurrent CPU hotplug via cpu_add_remove_lock.
+ */
+ lockup_detector_cleanup();
return ret;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..902149f05381 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
+ u64 now;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
+ now = event->shadow_ctx_time + perf_clock();
+ if (enabled)
+ *enabled = now - event->tstamp_enabled;
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event->oncpu == smp_processor_id()) {
event->pmu->read(event);
+ if (running)
+ *running = now - event->tstamp_running;
+ } else if (running) {
+ *running = event->total_time_running;
+ }
*value = local64_read(&event->count);
out:
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
+ .event = event,
};
int ret = 0;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
* by the client, but only by calling this function.
* This function can only be called on a registered smp_hotplug_thread.
*/
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
- const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
{
struct cpumask *old = plug_thread->cpumask;
- cpumask_var_t tmp;
+ static struct cpumask tmp;
unsigned int cpu;
- if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
- return -ENOMEM;
-
- get_online_cpus();
+ lockdep_assert_cpus_held();
mutex_lock(&smpboot_threads_lock);
/* Park threads that were exclusively enabled on the old mask. */
- cpumask_andnot(tmp, old, new);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, old, new);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_park_thread(plug_thread, cpu);
/* Unpark threads that are exclusively enabled on the new mask. */
- cpumask_andnot(tmp, new, old);
- for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ cpumask_andnot(&tmp, new, old);
+ for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_unpark_thread(plug_thread, cpu);
cpumask_copy(old, new);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
-
- free_cpumask_var(tmp);
-
- return 0;
}
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4da9e622471f..d9c31bc2eaea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -872,9 +872,9 @@ static struct ctl_table kern_table[] = {
#if defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
@@ -890,16 +890,12 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &nmi_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = NMI_WATCHDOG_SYSCTL_PERM,
.proc_handler = proc_nmi_watchdog,
.extra1 = &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
.extra2 = &one,
-#else
- .extra2 = &zero,
-#endif
},
{
.procname = "watchdog_cpumask",
@@ -911,9 +907,9 @@ static struct ctl_table kern_table[] = {
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
{
.procname = "soft_watchdog",
- .data = &soft_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
+ .data = &soft_watchdog_user_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_soft_watchdog,
.extra1 = &zero,
.extra2 = &one,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..3126da2f468a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+ u64 *value, u64 *enabled, u64 *running)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- u64 value = 0;
- int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value);
+ return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+ u64 value = 0;
+ int err;
+
+ err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+ .func = bpf_perf_event_read_value,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
static __always_inline u64
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
default:
return tracing_func_proto(func_id);
}
@@ -524,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
@@ -576,6 +616,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+ .func = bpf_perf_prog_read_value_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -583,6 +649,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_perf_prog_read_value:
+ return &bpf_perf_prog_read_value_proto_tp;
default:
return tracing_func_proto(func_id);
}
@@ -602,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
@@ -662,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..6bcb854909c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,20 +29,29 @@
#include <linux/kvm_para.h>
#include <linux/kthread.h>
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
-int __read_mostly nmi_watchdog_enabled;
+static DEFINE_MUTEX(watchdog_mutex);
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
- NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 1
#else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT 0
#endif
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
+int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
}
static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 0;
else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ nmi_watchdog_user_enabled = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
-
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/*
* These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
*/
int __weak watchdog_nmi_enable(unsigned int cpu)
{
+ hardlockup_detector_perf_enable();
return 0;
}
+
void __weak watchdog_nmi_disable(unsigned int cpu)
{
+ hardlockup_detector_perf_disable();
}
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
- * - nmi_watchdog_enabled
+/* Return 0, if a NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+ return hardlockup_detector_perf_init();
+}
+
+/**
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ *
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
+ * update_variables();
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
+ *
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
+ * - watchdog_enabled
* - watchdog_thresh
* - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
- * - watchdog_suspended
*/
-void __weak watchdog_nmi_reconfigure(void)
+void __weak watchdog_nmi_start(void) { }
+
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
{
+ watchdog_enabled = 0;
+ if (!watchdog_user_enabled)
+ return;
+ if (nmi_watchdog_available && nmi_watchdog_user_enabled)
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ if (soft_watchdog_user_enabled)
+ watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
}
-
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
- for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+static bool softlockup_threads_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
-unsigned int __read_mostly softlockup_panic =
- CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
static int __init softlockup_panic_setup(char *str)
{
softlockup_panic = simple_strtoul(str, NULL, 0);
-
return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_enabled = 0;
+ watchdog_user_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
static int __init nosoftlockup_setup(char *str)
{
- watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+ soft_watchdog_user_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
- sysctl_softlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
+ sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
- sysctl_hardlockup_all_cpu_backtrace =
- !!simple_strtol(str, NULL, 0);
- return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
#endif
+static void __lockup_detector_cleanup(void);
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
int cpu;
/*
- * this is done lockless
- * do we care if a 0 races with a timestamp?
- * all it means is the softlock check starts one cycle later
+ * watchdog_mutex cannpt be taken here, as this might be called
+ * from (soft)interrupt context, so the access to
+ * watchdog_allowed_cpumask might race with a concurrent update.
+ *
+ * The watchdog time stamp can race against a concurrent real
+ * update as well, the only side effect might be a cycle delay for
+ * the softlockup check.
*/
- for_each_watchdog_cpu(cpu)
+ for_each_cpu(cpu, &watchdog_allowed_mask)
per_cpu(watchdog_touch_ts, cpu) = 0;
wq_watchdog_touch(-1);
}
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
- if (atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- /* Enable the perf event */
- watchdog_nmi_enable(cpu);
-
- /* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
- /* initialize timestamp */
- watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ /* Initialize timestamp */
__touch_watchdog();
+ /* Enable the perf event */
+ if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+ watchdog_nmi_enable(cpu);
+
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}
static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
-
- /*
- * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
- * failure path. Check for failures that can occur asynchronously -
- * for example, when CPUs are on-lined - and shut down the hardware
- * perf event on each CPU accordingly.
- *
- * The only non-obvious place this bit can be cleared is through
- * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
- * pr_info here would be too noisy as it would result in a message
- * every few seconds if the hardlockup was disabled but the softlockup
- * enabled.
- */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- watchdog_nmi_disable(cpu);
}
static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
+static void softlockup_update_smpboot_threads(void)
{
- int cpu, ret = 0;
+ lockdep_assert_held(&watchdog_mutex);
- atomic_set(&watchdog_park_in_progress, 1);
+ if (!softlockup_threads_initialized)
+ return;
- for_each_watchdog_cpu(cpu) {
- ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
- if (ret)
- break;
- }
-
- atomic_set(&watchdog_park_in_progress, 0);
-
- return ret;
+ smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+ &watchdog_allowed_mask);
}
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
{
- int cpu;
-
- for_each_watchdog_cpu(cpu)
- kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ cpumask_clear(&watchdog_allowed_mask);
+ softlockup_update_smpboot_threads();
}
-static int update_watchdog_all_cpus(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
{
- int ret;
-
- ret = watchdog_park_threads();
- if (ret)
- return ret;
-
- watchdog_unpark_threads();
-
- return 0;
+ cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+ softlockup_update_smpboot_threads();
}
-static int watchdog_enable_all_cpus(void)
+static void lockup_detector_reconfigure(void)
{
- int err = 0;
-
- if (!watchdog_running) {
- err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
- &watchdog_cpumask);
- if (err)
- pr_err("Failed to create watchdog threads, disabled\n");
- else
- watchdog_running = 1;
- } else {
- /*
- * Enable/disable the lockup detectors or
- * change the sample period 'on the fly'.
- */
- err = update_watchdog_all_cpus();
-
- if (err) {
- watchdog_disable_all_cpus();
- pr_err("Failed to update lockup detectors, disabled\n");
- }
- }
-
- if (err)
- watchdog_enabled = 0;
-
- return err;
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ softlockup_park_all_threads();
+ set_sample_period();
+ lockup_detector_update_enable();
+ if (watchdog_enabled && watchdog_thresh)
+ softlockup_unpark_threads();
+ watchdog_nmi_start();
+ cpus_read_unlock();
+ /*
+ * Must be called outside the cpus locked section to prevent
+ * recursive locking in the perf code.
+ */
+ __lockup_detector_cleanup();
}
-static void watchdog_disable_all_cpus(void)
+/*
+ * Create the watchdog thread infrastructure and configure the detector(s).
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty. When
+ * the threads are sucessfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void lockup_detector_setup(void)
{
- if (watchdog_running) {
- watchdog_running = 0;
- smpboot_unregister_percpu_thread(&watchdog_threads);
- }
-}
+ int ret;
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
- return smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask);
-}
-#endif
+ /*
+ * If sysctl is off and watchdog got disabled on the command line,
+ * nothing to do here.
+ */
+ lockup_detector_update_enable();
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
- return 0;
-}
+ if (!IS_ENABLED(CONFIG_SYSCTL) &&
+ !(watchdog_enabled && watchdog_thresh))
+ return;
-static void watchdog_unpark_threads(void)
-{
-}
+ ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_allowed_mask);
+ if (ret) {
+ pr_err("Failed to initialize soft lockup detector threads\n");
+ return;
+ }
-static int watchdog_enable_all_cpus(void)
-{
- return 0;
+ mutex_lock(&watchdog_mutex);
+ softlockup_threads_initialized = true;
+ lockup_detector_reconfigure();
+ mutex_unlock(&watchdog_mutex);
}
-static void watchdog_disable_all_cpus(void)
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static void lockup_detector_reconfigure(void)
{
+ cpus_read_lock();
+ watchdog_nmi_stop();
+ lockup_detector_update_enable();
+ watchdog_nmi_start();
+ cpus_read_unlock();
}
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
+static inline void lockup_detector_setup(void)
{
- return 0;
+ lockup_detector_reconfigure();
}
-#endif
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
-static void set_sample_period(void)
+static void __lockup_detector_cleanup(void)
{
+ lockdep_assert_held(&watchdog_mutex);
+ hardlockup_detector_perf_cleanup();
}
-#endif /* SOFTLOCKUP */
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
*/
-int lockup_detector_suspend(void)
+void lockup_detector_cleanup(void)
{
- int ret = 0;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
- /*
- * Multiple suspend requests can be active in parallel (counted by
- * the 'watchdog_suspended' variable). If the watchdog threads are
- * running, the first caller takes care that they will be parked.
- * The state of 'watchdog_running' cannot change while a suspend
- * request is active (see related code in 'proc' handlers).
- */
- if (watchdog_running && !watchdog_suspended)
- ret = watchdog_park_threads();
-
- if (ret == 0)
- watchdog_suspended++;
- else {
- watchdog_disable_all_cpus();
- pr_err("Failed to suspend lockup detectors, disabled\n");
- watchdog_enabled = 0;
- }
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
-
- return ret;
+ mutex_lock(&watchdog_mutex);
+ __lockup_detector_cleanup();
+ mutex_unlock(&watchdog_mutex);
}
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_poweroff() function which busy loops forever.
*/
-void lockup_detector_resume(void)
+void lockup_detector_soft_poweroff(void)
{
- mutex_lock(&watchdog_proc_mutex);
-
- watchdog_suspended--;
- /*
- * The watchdog threads are unparked if they were previously running
- * and if there is no more active suspend request.
- */
- if (watchdog_running && !watchdog_suspended)
- watchdog_unpark_threads();
-
- watchdog_nmi_reconfigure();
-
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ watchdog_enabled = 0;
}
#ifdef CONFIG_SYSCTL
-/*
- * Update the run state of the lockup detectors.
- */
-static int proc_watchdog_update(void)
+/* Propagate any changes to the watchdog threads */
+static void proc_watchdog_update(void)
{
- int err = 0;
-
- /*
- * Watchdog threads won't be started if they are already active.
- * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
- * care of this. If those threads are already active, the sample
- * period will be updated and the lockup detectors will be enabled
- * or disabled 'on the fly'.
- */
- if (watchdog_enabled && watchdog_thresh)
- err = watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
-
- watchdog_nmi_reconfigure();
-
- return err;
-
+ /* Remove impossible cpus to keep sysctl output clean. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+ lockup_detector_reconfigure();
}
/*
* common function for watchdog, nmi_watchdog and soft_watchdog parameter
*
- * caller | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- * | | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller | table->data points to | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
+ * | | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
- int *watchdog_param = (int *)table->data;
+ int err, old, *param = table->data;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ mutex_lock(&watchdog_mutex);
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
-
- /*
- * If the parameter is being read return the state of the corresponding
- * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
- * run state of the lockup detectors.
- */
if (!write) {
- *watchdog_param = (watchdog_enabled & which) != 0;
+ /*
+ * On read synchronize the userspace interface. This is a
+ * racy snapshot.
+ */
+ *param = (watchdog_enabled & which) != 0;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
} else {
+ old = READ_ONCE(*param);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err)
- goto out;
-
- /*
- * There is a race window between fetching the current value
- * from 'watchdog_enabled' and storing the new value. During
- * this race window, watchdog_nmi_enable() can sneak in and
- * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
- * The 'cmpxchg' detects this race and the loop retries.
- */
- do {
- old = watchdog_enabled;
- /*
- * If the parameter value is not zero set the
- * corresponding bit(s), else clear it(them).
- */
- if (*watchdog_param)
- new = old | which;
- else
- new = old & ~which;
- } while (cmpxchg(&watchdog_enabled, old, new) != old);
-
- /*
- * Update the run state of the lockup detectors. There is _no_
- * need to check the value returned by proc_watchdog_update()
- * and to restore the previous value of 'watchdog_enabled' as
- * both lockup detectors are disabled if proc_watchdog_update()
- * returns an error.
- */
- if (old == new)
- goto out;
-
- err = proc_watchdog_update();
+ if (!err && old != READ_ONCE(*param))
+ proc_watchdog_update();
}
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
int proc_nmi_watchdog(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ if (!nmi_watchdog_available && write)
+ return -ENOTSUPP;
return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
}
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old, new;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
+ int err, old;
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
- old = ACCESS_ONCE(watchdog_thresh);
+ old = READ_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err || !write)
- goto out;
-
- /*
- * Update the sample period. Restore on failure.
- */
- new = ACCESS_ONCE(watchdog_thresh);
- if (old == new)
- goto out;
+ if (!err && write && old != READ_ONCE(watchdog_thresh))
+ proc_watchdog_update();
- set_sample_period();
- err = proc_watchdog_update();
- if (err) {
- watchdog_thresh = old;
- set_sample_period();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
{
int err;
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
-
- if (watchdog_suspended) {
- /* no parameter changes allowed while watchdog is suspended */
- err = -EAGAIN;
- goto out;
- }
+ mutex_lock(&watchdog_mutex);
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
- if (!err && write) {
- /* Remove impossible cpus to keep sysctl output cleaner. */
- cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
- cpu_possible_mask);
-
- if (watchdog_running) {
- /*
- * Failure would be due to being unable to allocate
- * a temporary cpumask, so we are likely not in a
- * position to do much else to make things better.
- */
- if (watchdog_update_cpus() != 0)
- pr_err("cpumask update failed\n");
- }
+ if (!err && write)
+ proc_watchdog_update();
- watchdog_nmi_reconfigure();
- }
-out:
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
+ mutex_unlock(&watchdog_mutex);
return err;
}
-
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
- set_sample_period();
-
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
+ if (!watchdog_nmi_probe())
+ nmi_watchdog_available = true;
+ lockup_detector_setup();
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..71a62ceacdc8 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,8 +21,10 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static struct cpumask dead_events_mask;
static unsigned long hardlockup_allcpu_dumped;
+static unsigned int watchdog_cpus;
void arch_touch_nmi_watchdog(void)
{
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
+static int hardlockup_detector_event_create(void)
{
+ unsigned int cpu = smp_processor_id();
struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
- int firstcpu = 0;
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- if (atomic_inc_return(&watchdog_cpus) == 1)
- firstcpu = 1;
+ struct perf_event *evt;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ watchdog_overflow_callback, NULL);
+ if (IS_ERR(evt)) {
+ pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
+ return PTR_ERR(evt);
+ }
+ this_cpu_write(watchdog_ev, evt);
+ return 0;
+}
- /* save the first cpu's error for future comparision */
- if (firstcpu && IS_ERR(event))
- firstcpu_err = PTR_ERR(event);
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+ if (hardlockup_detector_event_create())
+ return;
- if (!IS_ERR(event)) {
- /* only print for the first cpu initialized */
- if (firstcpu || firstcpu_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
+ if (!watchdog_cpus++)
+ pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
+ perf_event_enable(this_cpu_read(watchdog_ev));
}
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ struct perf_event *event = this_cpu_read(watchdog_ev);
if (event) {
perf_event_disable(event);
+ cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+ watchdog_cpus--;
+ }
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, &dead_events_mask) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /*
+ * Required because for_each_cpu() reports unconditionally
+ * CPU0 as set on UP kernels. Sigh.
+ */
+ if (event)
+ perf_event_release_kernel(event);
per_cpu(watchdog_ev, cpu) = NULL;
+ }
+ cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_disable(event);
+ }
+}
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+ int cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return;
+
+ for_each_online_cpu(cpu) {
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event)
+ perf_event_enable(event);
+ }
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+ int ret = hardlockup_detector_event_create();
- /* watchdog_nmi_enable() expects this to be zero initially. */
- if (atomic_dec_and_test(&watchdog_cpus))
- firstcpu_err = 0;
+ if (ret) {
+ pr_info("Perf NMI watchdog permanently disabled\n");
+ } else {
+ perf_event_release_kernel(this_cpu_read(watchdog_ev));
+ this_cpu_write(watchdog_ev, NULL);
}
+ return ret;
}