diff options
Diffstat (limited to 'samples/bpf')
30 files changed, 1974 insertions, 34 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 502c9fc8db85..90ebf7d35c07 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -14,11 +14,16 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 +hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist hostprogs-y += offwaketime hostprogs-y += spintest hostprogs-y += map_perf_test +hostprogs-y += test_overhead +hostprogs-y += test_cgrp2_array_pin +hostprogs-y += xdp1 +hostprogs-y += xdp2 test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -33,11 +38,17 @@ tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o tracex5-objs := bpf_load.o libbpf.o tracex5_user.o tracex6-objs := bpf_load.o libbpf.o tracex6_user.o +test_probe_write_user-objs := bpf_load.o libbpf.o test_probe_write_user_user.o trace_output-objs := bpf_load.o libbpf.o trace_output_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o +test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o +test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o +xdp1-objs := bpf_load.o libbpf.o xdp1_user.o +# reuse xdp1 source intentionally +xdp2-objs := bpf_load.o libbpf.o xdp1_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -50,12 +61,19 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o +always += test_probe_write_user_kern.o always += trace_output_kern.o always += tcbpf1_kern.o always += lathist_kern.o always += offwaketime_kern.o always += spintest_kern.o always += map_perf_test_kern.o +always += test_overhead_tp_kern.o +always += test_overhead_kprobe_kern.o +always += parse_varlen.o parse_simple.o parse_ldabs.o 
+always += test_cgrp2_tc_kern.o +always += xdp1_kern.o +always += xdp2_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -70,22 +88,54 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_test_probe_write_user += -lelf HOSTLOADLIBES_trace_output += -lelf -lrt HOSTLOADLIBES_lathist += -lelf HOSTLOADLIBES_offwaketime += -lelf HOSTLOADLIBES_spintest += -lelf HOSTLOADLIBES_map_perf_test += -lelf -lrt +HOSTLOADLIBES_test_overhead += -lelf -lrt +HOSTLOADLIBES_xdp1 += -lelf +HOSTLOADLIBES_xdp2 += -lelf -# point this to your LLVM backend with bpf support -LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang -# asm/sysreg.h inline assmbly used by it is incompatible with llvm. -# But, ehere is not easy way to fix it, so just exclude it since it is +# Trick to allow make to be run from this directory +all: + $(MAKE) -C ../../ $$PWD/ + +clean: + $(MAKE) -C ../../ M=$$PWD clean + @rm -f *~ + +# Verify LLVM compiler tools are available and bpf target is supported by llc +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) + +verify_cmds: $(CLANG) $(LLC) + @for TOOL in $^ ; do \ + if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ + echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ + exit 1; \ + else true; fi; \ + done + +verify_target_bpf: verify_cmds + @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ + echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ + echo " NOTICE: LLVM version >= 3.7.1 required" ;\ + exit 2; \ + else true; fi + +$(src)/*.c: verify_target_bpf + +# asm/sysreg.h - inline assembly used by it is incompatible with llvm. +# But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. 
$(obj)/%.o: $(src)/%.c - clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ + $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ - clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ - -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=asm -o $@.s diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst new file mode 100644 index 000000000000..a43eae3f0551 --- /dev/null +++ b/samples/bpf/README.rst @@ -0,0 +1,66 @@ +eBPF sample programs +==================== + +This directory contains a mini eBPF library, test stubs, verifier +test-suite and examples for using eBPF. + +Build dependencies +================== + +Compiling requires having installed: + * clang >= version 3.4.0 + * llvm >= version 3.7.1 + +Note that LLVM's tool 'llc' must support target 'bpf', list version +and supported targets with command: ``llc --version`` + +Kernel headers +-------------- + +There are usually dependencies to header files of the current kernel. +To avoid installing devel kernel headers system wide, as a normal +user, simply call:: + + make headers_install + +This will create a local "usr/include" directory in the git/build top +level directory, that the make system automatically picks up first. + +Compiling +========= + +For building the BPF samples, issue the below command from the kernel +top level directory:: + + make samples/bpf/ + +Do notice the "/" slash after the directory name. + +It is also possible to call make from this directory. This will just +hide the invocation of make as above with the appended "/". + +Manually compiling LLVM with 'bpf' support +------------------------------------------ + +Since version 3.7.0, LLVM adds a proper LLVM backend target for the +BPF bytecode architecture. 
+ +By default llvm will build all non-experimental backends including bpf. +To generate a smaller llc binary one can use:: + + -DLLVM_TARGETS_TO_BUILD="BPF" + +Quick snippet for manually compiling LLVM and clang +(build dependencies are cmake and gcc-c++):: + + $ git clone http://llvm.org/git/llvm.git + $ cd llvm/tools + $ git clone --depth 1 http://llvm.org/git/clang.git + $ cd ..; mkdir build; cd build + $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" + $ make -j $(getconf _NPROCESSORS_ONLN) + +It is also possible to point make to the newly compiled 'llc' or +'clang' command via redefining LLC or CLANG on the make command line:: + + make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 9363500131a7..217c8d507f2e 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -41,6 +41,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, (void *) BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = (void *) BPF_FUNC_get_stackid; +static int (*bpf_probe_write_user)(void *dst, void *src, int size) = + (void *) BPF_FUNC_probe_write_user; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions @@ -70,6 +72,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag (void *) BPF_FUNC_l3_csum_replace; static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = (void *) BPF_FUNC_l4_csum_replace; +static int (*bpf_skb_in_cgroup)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_skb_in_cgroup; #if defined(__x86_64__) @@ -82,6 +86,7 @@ static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flag #define PT_REGS_FP(x) ((x)->bp) #define PT_REGS_RC(x) ((x)->ax) #define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->ip) #elif defined(__s390x__) @@ -94,6 +99,7 @@ static int 
(*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flag #define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ #define PT_REGS_RC(x) ((x)->gprs[2]) #define PT_REGS_SP(x) ((x)->gprs[15]) +#define PT_REGS_IP(x) ((x)->ip) #elif defined(__aarch64__) @@ -106,6 +112,30 @@ static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flag #define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ #define PT_REGS_RC(x) ((x)->regs[0]) #define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->pc) + +#elif defined(__powerpc__) + +#define PT_REGS_PARM1(x) ((x)->gpr[3]) +#define PT_REGS_PARM2(x) ((x)->gpr[4]) +#define PT_REGS_PARM3(x) ((x)->gpr[5]) +#define PT_REGS_PARM4(x) ((x)->gpr[6]) +#define PT_REGS_PARM5(x) ((x)->gpr[7]) +#define PT_REGS_RC(x) ((x)->gpr[3]) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->nip) #endif + +#ifdef __powerpc__ +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#else +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), \ + (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) +#endif + #endif diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 58f86bd11b3d..0cfda2320320 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -49,6 +49,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_socket = strncmp(event, "socket", 6) == 0; bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; + bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; + bool is_xdp = strncmp(event, "xdp", 3) == 0; enum bpf_prog_type prog_type; char buf[256]; int fd, efd, err, id; @@ -63,6 +65,10 @@ static int load_and_attach(const char *event, struct 
bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCKET_FILTER; } else if (is_kprobe || is_kretprobe) { prog_type = BPF_PROG_TYPE_KPROBE; + } else if (is_tracepoint) { + prog_type = BPF_PROG_TYPE_TRACEPOINT; + } else if (is_xdp) { + prog_type = BPF_PROG_TYPE_XDP; } else { printf("Unknown event '%s'\n", event); return -1; @@ -76,6 +82,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; + if (is_xdp) + return 0; + if (is_socket) { event += 6; if (*event != '/') @@ -111,12 +120,23 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) event, strerror(errno)); return -1; } - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event); - strcat(buf, "/id"); + strcpy(buf, DEBUGFS); + strcat(buf, "events/kprobes/"); + strcat(buf, event); + strcat(buf, "/id"); + } else if (is_tracepoint) { + event += 11; + + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + strcpy(buf, DEBUGFS); + strcat(buf, "events/"); + strcat(buf, event); + strcat(buf, "/id"); + } efd = open(buf, O_RDONLY, 0); if (efd < 0) { @@ -304,6 +324,8 @@ int load_bpf_file(char *path) if (memcmp(shname_prog, "kprobe/", 7) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 || + memcmp(shname_prog, "tracepoint/", 11) == 0 || + memcmp(shname_prog, "xdp", 3) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -320,6 +342,8 @@ int load_bpf_file(char *path) if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 || + memcmp(shname, "tracepoint/", 11) == 0 || + memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 95af56ec5739..3147377e8fd3 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -17,6 +17,7 @@ #include 
<linux/bpf.h> #include <string.h> #include <time.h> +#include <sys/resource.h> #include "libbpf.h" #include "bpf_load.h" diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index c0aa5a9b9c48..e7d9a0a3d45b 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -11,7 +11,7 @@ #include <linux/version.h> #include <linux/sched.h> -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) #define MINBLOCK_US 1 @@ -61,7 +61,7 @@ SEC("kprobe/try_to_wake_up") int waker(struct pt_regs *ctx) { struct task_struct *p = (void *) PT_REGS_PARM1(ctx); - struct wokeby_t woke = {}; + struct wokeby_t woke; u32 pid; pid = _(p->pid); @@ -73,19 +73,21 @@ int waker(struct pt_regs *ctx) return 0; } -static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta) +static inline int update_counts(void *ctx, u32 pid, u64 delta) { - struct key_t key = {}; struct wokeby_t *woke; u64 zero = 0, *val; + struct key_t key; + __builtin_memset(&key.waker, 0, sizeof(key.waker)); bpf_get_current_comm(&key.target, sizeof(key.target)); key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); + key.wret = 0; woke = bpf_map_lookup_elem(&wokeby, &pid); if (woke) { key.wret = woke->ret; - __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN); + __builtin_memcpy(&key.waker, woke->name, sizeof(key.waker)); bpf_map_delete_elem(&wokeby, &pid); } @@ -100,15 +102,33 @@ static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta) return 0; } +#if 1 +/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ +struct sched_switch_args { + unsigned long long pad; + char prev_comm[16]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[16]; + int next_pid; + int next_prio; +}; +SEC("tracepoint/sched/sched_switch") +int oncpu(struct sched_switch_args *ctx) +{ + /* record previous thread sleep time */ + u32 
pid = ctx->prev_pid; +#else SEC("kprobe/finish_task_switch") int oncpu(struct pt_regs *ctx) { struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + /* record previous thread sleep time */ + u32 pid = _(p->pid); +#endif u64 delta, ts, *tsp; - u32 pid; - /* record previous thread sleep time */ - pid = _(p->pid); ts = bpf_ktime_get_ns(); bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c new file mode 100644 index 000000000000..d17550198d06 --- /dev/null +++ b/samples/bpf/parse_ldabs.c @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define DEFAULT_PKTGEN_UDP_PORT 9 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +SEC("ldabs") +int handle_ingress(struct __sk_buff *skb) +{ + __u64 troff = ETH_HLEN + sizeof(struct iphdr); + + if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP) + return 0; + if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP || + load_byte(skb, ETH_HLEN) != 0x45) + return 0; + if (ip_is_fragment(skb, ETH_HLEN)) + return 0; + if (load_half(skb, troff + offsetof(struct udphdr, dest)) == DEFAULT_PKTGEN_UDP_PORT) + return TC_ACT_SHOT; + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c new file mode 100644 index 000000000000..cf2511c33905 --- /dev/null +++ b/samples/bpf/parse_simple.c @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 Facebook + * + * 
This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <uapi/linux/bpf.h> +#include <net/ip.h> +#include "bpf_helpers.h" + +#define DEFAULT_PKTGEN_UDP_PORT 9 + +/* copy of 'struct ethhdr' without __packed */ +struct eth_hdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_proto; +}; + +SEC("simple") +int handle_ingress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + struct iphdr *iph = data + sizeof(*eth); + struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); + void *data_end = (void *)(long)skb->data_end; + + /* single length check */ + if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) + return 0; + + if (eth->h_proto != htons(ETH_P_IP)) + return 0; + if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) + return 0; + if (ip_is_fragment(iph)) + return 0; + if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT)) + return TC_ACT_SHOT; + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c new file mode 100644 index 000000000000..edab34dce79b --- /dev/null +++ b/samples/bpf/parse_varlen.c @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <uapi/linux/bpf.h> +#include <net/ip.h> +#include "bpf_helpers.h" + +#define DEFAULT_PKTGEN_UDP_PORT 9 +#define DEBUG 0 + +static int tcp(void *data, uint64_t tp_off, void *data_end) +{ + struct tcphdr *tcp = data + tp_off; + + if (tcp + 1 > data_end) + return 0; + if (tcp->dest == htons(80) || tcp->source == htons(80)) + return TC_ACT_SHOT; + return 0; +} + +static int udp(void *data, uint64_t tp_off, void *data_end) +{ + struct udphdr *udp = data + tp_off; + + if (udp + 1 > data_end) + return 0; + if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) || + udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) { + if (DEBUG) { + char fmt[] = "udp port 9 indeed\n"; + + bpf_trace_printk(fmt, sizeof(fmt)); + } + return TC_ACT_SHOT; + } + return 0; +} + +static int parse_ipv4(void *data, uint64_t nh_off, void *data_end) +{ + struct iphdr *iph; + uint64_t ihl_len; + + iph = data + nh_off; + if (iph + 1 > data_end) + return 0; + + if (ip_is_fragment(iph)) + return 0; + ihl_len = iph->ihl * 4; + + if (iph->protocol == IPPROTO_IPIP) { + iph = data + nh_off + ihl_len; + if (iph + 1 > data_end) + return 0; + ihl_len += iph->ihl * 4; + } + + if (iph->protocol == IPPROTO_TCP) + return tcp(data, nh_off + ihl_len, data_end); + else if (iph->protocol == IPPROTO_UDP) + return udp(data, nh_off + ihl_len, data_end); + return 0; +} + +static int parse_ipv6(void *data, uint64_t nh_off, void *data_end) +{ + struct ipv6hdr *ip6h; + struct iphdr *iph; + uint64_t ihl_len = sizeof(struct ipv6hdr); + uint64_t nexthdr; + + ip6h = data + nh_off; + if (ip6h + 1 > data_end) + return 0; + + nexthdr = ip6h->nexthdr; + + if (nexthdr == IPPROTO_IPIP) { + iph = data + nh_off + ihl_len; + if (iph + 1 > data_end) + return 0; + ihl_len += iph->ihl * 4; + nexthdr = iph->protocol; + } else if (nexthdr == IPPROTO_IPV6) { + ip6h = data + nh_off + ihl_len; + 
if (ip6h + 1 > data_end) + return 0; + ihl_len += sizeof(struct ipv6hdr); + nexthdr = ip6h->nexthdr; + } + + if (nexthdr == IPPROTO_TCP) + return tcp(data, nh_off + ihl_len, data_end); + else if (nexthdr == IPPROTO_UDP) + return udp(data, nh_off + ihl_len, data_end); + return 0; +} + +struct vlan_hdr { + uint16_t h_vlan_TCI; + uint16_t h_vlan_encapsulated_proto; +}; + +SEC("varlen") +int handle_ingress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct ethhdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + uint64_t h_proto, nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return 0; + + h_proto = eth->h_proto; + + if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return 0; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return 0; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_IP)) + return parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + return parse_ipv6(data, nh_off, data_end); + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 29a276d766fc..8a4085c2d117 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -5,6 +5,7 @@ #include "bpf_load.h" #include <unistd.h> #include <arpa/inet.h> +#include <sys/resource.h> struct pair { __u64 packets; @@ -13,11 +14,13 @@ struct pair { int main(int ac, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256]; FILE *f; int i, sock; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); if (load_bpf_file(filename)) { printf("%s", 
bpf_log_buf); diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 2617772d060d..d4184ab5f3ac 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -5,6 +5,7 @@ #include "bpf_load.h" #include <unistd.h> #include <arpa/inet.h> +#include <sys/resource.h> struct flow_keys { __be32 src; @@ -23,11 +24,13 @@ struct pair { int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256]; FILE *f; int i, sock; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c index 4b27619d91a4..ce0167d09cdc 100644 --- a/samples/bpf/spintest_kern.c +++ b/samples/bpf/spintest_kern.c @@ -34,7 +34,7 @@ struct bpf_map_def SEC("maps") stackmap = { #define PROG(foo) \ int foo(struct pt_regs *ctx) \ { \ - long v = ctx->ip, *val; \ + long v = PT_REGS_IP(ctx), *val; \ \ val = bpf_map_lookup_elem(&my_map, &v); \ bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \ diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c new file mode 100644 index 000000000000..70e86f7be69d --- /dev/null +++ b/samples/bpf/test_cgrp2_array_pin.c @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> + +#include "libbpf.h" + +static void usage(void) +{ + printf("Usage: test_cgrp2_array_pin [...]\n"); + printf(" -F <file> File to pin an BPF cgroup array\n"); + printf(" -U <file> Update an already pinned BPF cgroup array\n"); + printf(" -v <value> Full path of the cgroup2\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + const char *pinned_file = NULL, *cg2 = NULL; + int create_array = 1; + int array_key = 0; + int array_fd = -1; + int cg2_fd = -1; + int ret = -1; + int opt; + + while ((opt = getopt(argc, argv, "F:U:v:")) != -1) { + switch (opt) { + /* General args */ + case 'F': + pinned_file = optarg; + break; + case 'U': + pinned_file = optarg; + create_array = 0; + break; + case 'v': + cg2 = optarg; + break; + default: + usage(); + goto out; + } + } + + if (!cg2 || !pinned_file) { + usage(); + goto out; + } + + cg2_fd = open(cg2, O_RDONLY); + if (cg2_fd < 0) { + fprintf(stderr, "open(%s,...): %s(%d)\n", + cg2, strerror(errno), errno); + goto out; + } + + if (create_array) { + array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, + sizeof(uint32_t), sizeof(uint32_t), + 1, 0); + if (array_fd < 0) { + fprintf(stderr, + "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", + strerror(errno), errno); + goto out; + } + } else { + array_fd = bpf_obj_get(pinned_file); + if (array_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + + ret = bpf_update_elem(array_fd, &array_key, &cg2_fd, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + if (create_array) { + ret = bpf_obj_pin(array_fd, pinned_file); + if (ret) { + fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + } + +out: + if (array_fd != -1) + close(array_fd); + 
if (cg2_fd != -1) + close(cg2_fd); + return ret; +} diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh new file mode 100755 index 000000000000..0b119eeaf85c --- /dev/null +++ b/samples/bpf/test_cgrp2_tc.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +MY_DIR=$(dirname $0) +# Details on the bpf prog +BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin' +BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o" +BPF_SECTION='filter' + +[ -z "$TC" ] && TC='tc' +[ -z "$IP" ] && IP='ip' + +# Names of the veth interface, net namespace...etc. +HOST_IFC='ve' +NS_IFC='vens' +NS='ns' + +find_mnt() { + cat /proc/mounts | \ + awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }' +} + +# Init cgroup2 vars +init_cgrp2_vars() { + CGRP2_ROOT=$(find_mnt cgroup2) + if [ -z "$CGRP2_ROOT" ] + then + CGRP2_ROOT='/mnt/cgroup2' + MOUNT_CGRP2="yes" + fi + CGRP2_TC="$CGRP2_ROOT/tc" + CGRP2_TC_LEAF="$CGRP2_TC/leaf" +} + +# Init bpf fs vars +init_bpf_fs_vars() { + local bpf_fs_root=$(find_mnt bpf) + [ -n "$bpf_fs_root" ] || return -1 + BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals" +} + +setup_cgrp2() { + case $1 in + start) + if [ "$MOUNT_CGRP2" == 'yes' ] + then + [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT + mount -t cgroup2 none $CGRP2_ROOT || return $? + fi + mkdir -p $CGRP2_TC_LEAF + ;; + *) + rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC + [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT + ;; + esac +} + +setup_bpf_cgrp2_array() { + local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME" + case $1 in + start) + $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC + ;; + *) + [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array + ;; + esac +} + +setup_net() { + case $1 in + start) + $IP link add $HOST_IFC type veth peer name $NS_IFC || return $? + $IP link set dev $HOST_IFC up || return $? + sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0 + + $IP netns add ns || return $? + $IP link set dev $NS_IFC netns ns || return $? + $IP -n $NS link set dev $NS_IFC up || return $? 
+ $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0 + $TC qdisc add dev $HOST_IFC clsact || return $? + $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $? + ;; + *) + $IP netns del $NS + $IP link del $HOST_IFC + ;; + esac +} + +run_in_cgrp() { + # Fork another bash and move it under the specified cgroup. + # It makes the cgroup cleanup easier at the end of the test. + cmd='echo $$ > ' + cmd="$cmd $1/cgroup.procs; exec $2" + bash -c "$cmd" +} + +do_test() { + run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null" + local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \ + awk '/drop/{print substr($7, 0, index($7, ",")-1)}') + if [[ $dropped -eq 0 ]] + then + echo "FAIL" + return 1 + else + echo "Successfully filtered $dropped packets" + return 0 + fi +} + +do_exit() { + if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ] + then + echo "------ DEBUG ------" + echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo + echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo + if [ -d "$BPF_FS_TC_SHARE" ] + then + echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo + fi + echo "Host net:" + $IP netns + $IP link show dev $HOST_IFC + $IP -6 a show dev $HOST_IFC + $TC -s qdisc show dev $HOST_IFC + echo + echo "$NS net:" + $IP -n $NS link show dev $NS_IFC + $IP -n $NS -6 link show dev $NS_IFC + echo "------ DEBUG ------" + echo + fi + + if [ "$MODE" != 'nocleanup' ] + then + setup_net stop + setup_bpf_cgrp2_array stop + setup_cgrp2 stop + fi +} + +init_cgrp2_vars +init_bpf_fs_vars + +while [[ $# -ge 1 ]] +do + a="$1" + case $a in + debug) + DEBUG='yes' + shift 1 + ;; + cleanup-only) + MODE='cleanuponly' + shift 1 + ;; + no-cleanup) + MODE='nocleanup' + shift 1 + ;; + *) + echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]" + echo " debug: Print cgrp and network setup details at the end of the test" + echo " cleanup-only: Try to cleanup things from last test. 
No test will be run" + echo " no-cleanup: Run the test but don't do cleanup at the end" + echo "[Note: If no arg is given, it will run the test and do cleanup at the end]" + echo + exit -1 + ;; + esac +done + +trap do_exit 0 + +[ "$MODE" == 'cleanuponly' ] && exit + +setup_cgrp2 start || exit $? +setup_net start || exit $? +init_bpf_fs_vars || exit $? +setup_bpf_cgrp2_array start || exit $? +do_test +echo diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c new file mode 100644 index 000000000000..2732c37c8d5b --- /dev/null +++ b/samples/bpf/test_cgrp2_tc_kern.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/if_ether.h> +#include <uapi/linux/in6.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/pkt_cls.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* copy of 'struct ethhdr' without __packed */ +struct eth_hdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_proto; +}; + +#define PIN_GLOBAL_NS 2 +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +}; + +struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = { + .type = BPF_MAP_TYPE_CGROUP_ARRAY, + .size_key = sizeof(uint32_t), + .size_value = sizeof(uint32_t), + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +SEC("filter") +int handle_egress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + struct ipv6hdr *ip6h = data + sizeof(*eth); + void *data_end = (void *)(long)skb->data_end; + char dont_care_msg[] = "dont care %04x %d\n"; + char pass_msg[] = "pass\n"; + char reject_msg[] = "reject\n"; + + /* single length check */ + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + 
return TC_ACT_OK; + + if (eth->h_proto != htons(ETH_P_IPV6) || + ip6h->nexthdr != IPPROTO_ICMPV6) { + bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg), + eth->h_proto, ip6h->nexthdr); + return TC_ACT_OK; + } else if (bpf_skb_in_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { + bpf_trace_printk(pass_msg, sizeof(pass_msg)); + return TC_ACT_OK; + } else { + bpf_trace_printk(reject_msg, sizeof(reject_msg)); + return TC_ACT_SHOT; + } +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh new file mode 100755 index 000000000000..0365d5ee512c --- /dev/null +++ b/samples/bpf/test_cls_bpf.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +function pktgen { + ../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \ + -m 90:e2:ba:ff:ff:ff -d 192.168.0.1 -t 4 + local dropped=`tc -s qdisc show dev $IFC | tail -3 | awk '/drop/{print $7}'` + if [ "$dropped" == "0," ]; then + echo "FAIL" + else + echo "Successfully filtered " $dropped " packets" + fi +} + +function test { + echo -n "Loading bpf program '$2'... " + tc qdisc add dev $IFC clsact + tc filter add dev $IFC ingress bpf da obj $1 sec $2 + local status=$? + if [ $status -ne 0 ]; then + echo "FAIL" + else + echo "ok" + pktgen + fi + tc qdisc del dev $IFC clsact +} + +IFC=test_veth + +ip link add name $IFC type veth peer name pair_$IFC +ip link set $IFC up +ip link set pair_$IFC up + +test ./parse_simple.o simple +test ./parse_varlen.o varlen +test ./parse_ldabs.o ldabs +ip link del dev $IFC diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c new file mode 100644 index 000000000000..468a66a92ef9 --- /dev/null +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) + +SEC("kprobe/__set_task_comm") +int prog(struct pt_regs *ctx) +{ + struct signal_struct *signal; + struct task_struct *tsk; + char oldcomm[16] = {}; + char newcomm[16] = {}; + u16 oom_score_adj; + u32 pid; + + tsk = (void *)PT_REGS_PARM1(ctx); + + pid = _(tsk->pid); + bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx)); + signal = _(tsk->signal); + oom_score_adj = _(signal->oom_score_adj); + return 0; +} + +SEC("kprobe/urandom_read") +int prog2(struct pt_regs *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c new file mode 100644 index 000000000000..38f5c0b9da9f --- /dev/null +++ b/samples/bpf/test_overhead_tp_kern.c @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* from /sys/kernel/debug/tracing/events/task/task_rename/format */ +struct task_rename { + __u64 pad; + __u32 pid; + char oldcomm[16]; + char newcomm[16]; + __u16 oom_score_adj; +}; +SEC("tracepoint/task/task_rename") +int prog(struct task_rename *ctx) +{ + return 0; +} + +/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */ +struct urandom_read { + __u64 pad; + int got_bits; + int pool_left; + int input_left; +}; +SEC("tracepoint/random/urandom_read") +int prog2(struct urandom_read *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c new file mode 100644 index 000000000000..d291167fd3c7 --- /dev/null +++ b/samples/bpf/test_overhead_user.c @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <sys/types.h> +#include <asm/unistd.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> +#include <sys/wait.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include <string.h> +#include <time.h> +#include <sys/resource.h> +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_CNT 1000000 + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static void test_task_rename(int cpu) +{ + __u64 start_time; + char buf[] = "test\n"; + int i, fd; + + fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (fd < 0) { + printf("couldn't open /proc\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + write(fd, buf, sizeof(buf)); + printf("task_rename:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + close(fd); +} + +static void test_urandom_read(int cpu) +{ + __u64 start_time; + char buf[4]; + int i, fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + printf("couldn't open /dev/urandom\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + read(fd, buf, sizeof(buf)); + printf("urandom_read:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + close(fd); +} + +static void loop(int cpu, int flags) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + if (flags & 1) + test_task_rename(cpu); + if (flags & 2) + test_urandom_read(cpu); +} + +static void run_perf_test(int tasks, int flags) +{ + pid_t pid[tasks]; + int i; + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + loop(i, flags); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + } + for (i = 0; i < tasks; i++) { + int 
status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void unload_progs(void) +{ + close(prog_fd[0]); + close(prog_fd[1]); + close(event_fd[0]); + close(event_fd[1]); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + int num_cpu = 8; + int test_flags = ~0; + + setrlimit(RLIMIT_MEMLOCK, &r); + + if (argc > 1) + test_flags = atoi(argv[1]) ? : test_flags; + if (argc > 2) + num_cpu = atoi(argv[2]) ? : num_cpu; + + if (test_flags & 0x3) { + printf("BASE\n"); + run_perf_test(num_cpu, test_flags); + } + + if (test_flags & 0xC) { + snprintf(filename, sizeof(filename), + "%s_kprobe_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/KPROBE\n"); + run_perf_test(num_cpu, test_flags >> 2); + unload_progs(); + } + + if (test_flags & 0x30) { + snprintf(filename, sizeof(filename), + "%s_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 4); + unload_progs(); + } + + return 0; +} diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c new file mode 100644 index 000000000000..3a677c807044 --- /dev/null +++ b/samples/bpf/test_probe_write_user_kern.c @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") dnat_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct sockaddr_in), + .value_size = sizeof(struct sockaddr_in), + .max_entries = 256, +}; + +/* kprobe is NOT a stable ABI + * kernel functions can be removed, renamed or completely change semantics. + * Number of arguments and their positions can change, etc. + * In such case this bpf+kprobe example will no longer be meaningful + * + * This example sits on a syscall, and the syscall ABI is relatively stable + * of course, across platforms, and over time, the ABI may change. + */ +SEC("kprobe/sys_connect") +int bpf_prog1(struct pt_regs *ctx) +{ + struct sockaddr_in new_addr, orig_addr = {}; + struct sockaddr_in *mapped_addr; + void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx); + int sockaddr_len = (int)PT_REGS_PARM3(ctx); + + if (sockaddr_len > sizeof(orig_addr)) + return 0; + + if (bpf_probe_read(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0) + return 0; + + mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr); + if (mapped_addr != NULL) { + memcpy(&new_addr, mapped_addr, sizeof(new_addr)); + bpf_probe_write_user(sockaddr_arg, &new_addr, + sizeof(new_addr)); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c new file mode 100644 index 000000000000..a44bf347bedd --- /dev/null +++ b/samples/bpf/test_probe_write_user_user.c @@ -0,0 +1,78 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include <unistd.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <sys/socket.h> +#include <string.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + int serverfd, serverconnfd, clientfd; + socklen_t sockaddr_len; + 
struct sockaddr serv_addr, mapped_addr, tmp_addr; + struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; + char filename[256]; + char *ip; + + serv_addr_in = (struct sockaddr_in *)&serv_addr; + mapped_addr_in = (struct sockaddr_in *)&mapped_addr; + tmp_addr_in = (struct sockaddr_in *)&tmp_addr; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); + assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); + + /* Bind server to ephemeral port on lo */ + memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr_in->sin_family = AF_INET; + serv_addr_in->sin_port = 0; + serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0); + + sockaddr_len = sizeof(serv_addr); + assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0); + ip = inet_ntoa(serv_addr_in->sin_addr); + printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port)); + + memset(&mapped_addr, 0, sizeof(mapped_addr)); + mapped_addr_in->sin_family = AF_INET; + mapped_addr_in->sin_port = htons(5555); + mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); + + assert(!bpf_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY)); + + assert(listen(serverfd, 5) == 0); + + ip = inet_ntoa(mapped_addr_in->sin_addr); + printf("Client connecting to: %s:%d\n", + ip, ntohs(mapped_addr_in->sin_port)); + assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0); + + sockaddr_len = sizeof(tmp_addr); + ip = inet_ntoa(tmp_addr_in->sin_addr); + assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0); + printf("Server received connection from: %s:%d\n", + ip, ntohs(tmp_addr_in->sin_port)); + + sockaddr_len = sizeof(tmp_addr); + assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0); + ip = inet_ntoa(tmp_addr_in->sin_addr); + printf("Client's peer address: 
%s:%d\n", + ip, ntohs(tmp_addr_in->sin_port)); + + /* Is the server's getsockname = the socket getpeername */ + assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); + + return 0; +} diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 4b51a9039c0d..fe2fcec98c1f 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -309,6 +309,19 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, }, { + "check valid spill/fill, skb mark", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = ACCEPT, + }, + { "check corrupted spill/fill", .insns = { /* spill R1(ctx) into stack */ @@ -1180,6 +1193,341 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "raw_stack: no skb_load_bytes", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + /* Call to skb_load_bytes() omitted. 
*/ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid read from stack off -8+0 size 8", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, no init", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, init", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xcafe), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, spilled regs around bounds", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* spill ctx from R1 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* spill ctx from R1 */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8), /* fill ctx into R0 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8), /* fill ctx into R2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2, + offsetof(struct __sk_buff, priority)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), + 
BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, spilled regs corruption", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill ctx from R1 */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* fill ctx into R0 */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 invalid mem access 'inv'", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, spilled regs corruption 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* spill ctx from R1 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill ctx from R1 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* spill ctx from R1 */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8), /* fill ctx into R0 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8), /* fill ctx into R2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, 0), /* fill ctx into R3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2, + offsetof(struct __sk_buff, priority)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_3, + offsetof(struct __sk_buff, pkt_type)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R3 invalid mem access 'inv'", + .prog_type = 
BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, spilled regs + data", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* spill ctx from R1 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill ctx from R1 */ + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* spill ctx from R1 */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8), /* fill ctx into R0 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8), /* fill ctx into R2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, 0), /* fill data into R3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2, + offsetof(struct __sk_buff, priority)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -513), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-513 access_size=8", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -1), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + 
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-1 access_size=8", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 0xffffffff), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-1 access_size=-1", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -1), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 0x7fffffff), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-1 access_size=2147483647", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -512), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 0x7fffffff), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-512 access_size=2147483647", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, invalid access 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 
-512), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack type R3 off=-512 access_size=0", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "raw_stack: skb_load_bytes, large access", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 4), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -512), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_4, 512), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "pkt: test1", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "pkt: test2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 15), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 7), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_3, 12), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 14), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_4), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 
48), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 48), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "pkt: test3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access off=76", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + }, + { + "pkt: test4", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "cannot write", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, }; static int probe_filter_length(struct bpf_insn *fp) diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index 8d8d1ec429eb..9b96f4fb8cea 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -18,7 +18,6 @@ int bpf_prog1(struct pt_regs *ctx) u64 cookie; } data; - memset(&data, 0, sizeof(data)); data.pid = bpf_get_current_pid_tgid(); data.cookie = 0x12345678; diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index 3f450a8fa1f3..107da148820f 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -23,16 +23,14 @@ int bpf_prog1(struct pt_regs *ctx) /* attaches to kprobe netif_receive_skb, * looks for packets on 
loobpack device and prints them */ - char devname[IFNAMSIZ] = {}; + char devname[IFNAMSIZ]; struct net_device *dev; struct sk_buff *skb; int len; /* non-portable! works for the given kernel only */ skb = (struct sk_buff *) PT_REGS_PARM1(ctx); - dev = _(skb->dev); - len = _(skb->len); bpf_probe_read(devname, sizeof(devname), dev->name); diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index 09c1adc27d42..5e11c20ce5ec 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -27,10 +27,10 @@ int bpf_prog2(struct pt_regs *ctx) long init_val = 1; long *value; - /* x64/s390x specific: read ip of kfree_skb caller. + /* read ip of kfree_skb caller. * non-portable version of __builtin_return_address(0) */ - bpf_probe_read(&loc, sizeof(loc), (void *)PT_REGS_RET(ctx)); + BPF_KPROBE_READ_RET_IP(loc, ctx); value = bpf_map_lookup_elem(&my_map, &loc); if (value) @@ -66,7 +66,7 @@ struct hist_key { char comm[16]; u64 pid_tgid; u64 uid_gid; - u32 index; + u64 index; }; struct bpf_map_def SEC("maps") my_hist_map = { @@ -82,7 +82,7 @@ int bpf_prog3(struct pt_regs *ctx) long write_size = PT_REGS_PARM3(ctx); long init_val = 1; long *value; - struct hist_key key = {}; + struct hist_key key; key.index = log2l(write_size); key.pid_tgid = bpf_get_current_pid_tgid(); diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index ac4671420cf1..6dd8e384de96 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -40,7 +40,7 @@ int bpf_prog2(struct pt_regs *ctx) long ip = 0; /* get ip address of kmem_cache_alloc_node() caller */ - bpf_probe_read(&ip, sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); + BPF_KRETPROBE_READ_RET_IP(ip, ctx); struct pair v = { .val = bpf_ktime_get_ns(), diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index b3f4295bf288..f95f232cbab9 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -22,7 +22,7 @@ struct bpf_map_def SEC("maps") progs = { 
SEC("kprobe/seccomp_phase1") int bpf_prog1(struct pt_regs *ctx) { - struct seccomp_data sd = {}; + struct seccomp_data sd; bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); @@ -40,7 +40,7 @@ int bpf_prog1(struct pt_regs *ctx) /* we jump here when syscall number == __NR_write */ PROG(__NR_write)(struct pt_regs *ctx) { - struct seccomp_data sd = {}; + struct seccomp_data sd; bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); if (sd.args[2] == 512) { @@ -53,7 +53,7 @@ PROG(__NR_write)(struct pt_regs *ctx) PROG(__NR_read)(struct pt_regs *ctx) { - struct seccomp_data sd = {}; + struct seccomp_data sd; bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); if (sd.args[2] > 128 && sd.args[2] <= 1024) { diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c new file mode 100644 index 000000000000..219742106bfd --- /dev/null +++ b/samples/bpf/xdp1_kern.c @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 ipproto; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + ipproto = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + ipproto = parse_ipv6(data, nh_off, data_end); + else + ipproto = 0; + + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp1_user.c 
b/samples/bpf/xdp1_user.c new file mode 100644 index 000000000000..a5e109e398a1 --- /dev/null +++ b/samples/bpf/xdp1_user.c @@ -0,0 +1,181 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "bpf_load.h" +#include "libbpf.h" + +/* Send one RTM_SETLINK request for interface ifindex over a NETLINK_ROUTE socket, carrying fd inside a nested attribute (outer type 43, inner type 1), then read and validate the reply. Returns 0 when no message in the reply reports an error, -1 on any socket, validation or netlink error. The socket is closed on every path after it has been opened. */ static int set_link_xdp_fd(int ifindex, int fd) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + /* build the request: netlink header, ifinfomsg, then one nested attribute placed in attrbuf */ memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + /* the outer attribute starts at the aligned end of header plus ifinfomsg */ nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; + + /* the inner attribute follows the outer header and holds the fd as its payload */ nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN); + nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len; + /* grow the total message length by the aligned size of the outer attribute */
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + printf("send to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + printf("recv from netlink: %s\n", strerror(errno)); + goto cleanup; + } + + /* walk every netlink message contained in the single recv buffer */ for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + /* NOTE(review): assumes the port id the kernel assigned to this socket equals getpid() - confirm */ if (nh->nlmsg_pid != getpid()) { + printf("Wrong pid %d, expected %d\n", + nh->nlmsg_pid, getpid()); + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + printf("Wrong seq %d, expected %d\n", + nh->nlmsg_seq, seq); + goto cleanup; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + /* a zero error code is not treated as a failure; move on to the next message */ if (!err->error) + continue; + printf("nlmsg error %s\n", strerror(-err->error)); + goto cleanup; + case NLMSG_DONE: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} + +/* interface index parsed from argv[1] in main; also read by the SIGINT handler */ static int ifindex; + +/* SIGINT handler: calls set_link_xdp_fd with fd -1 for the saved interface, then exits with status 0. NOTE(review): the callee uses printf, which is not async-signal-safe - confirm this is acceptable for a sample. */ static void int_exit(int sig) +{ + set_link_xdp_fd(ifindex, -1); + exit(0); +} + +/* simple per-protocol drop counter: every interval seconds, looks up keys 0..255 in map_fd[0], sums the per-CPU differences since the previous pass and prints each non-zero sum divided by interval. Loops forever and never returns. + */ +static void poll_stats(int interval) +{ + /* one value slot per configured CPU */ unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + const unsigned int nr_keys = 256; + __u64 values[nr_cpus], prev[nr_keys][nr_cpus]; + __u32 key; + int i; + + memset(prev, 0, sizeof(prev)); + + while (1) { + sleep(interval); + + for (key = 0; key < nr_keys; key++) { + __u64 sum = 0; + + /* NOTE(review): the lookup is the argument of assert(), so it is compiled out entirely when NDEBUG is defined */ assert(bpf_lookup_elem(map_fd[0], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[key][i]); + if (sum) + printf("proto %u: %10llu pkt/s\n", + key, sum / interval); + /* remember this pass so the next one reports only the delta */ memcpy(prev[key], values, sizeof(values)); + } + } +} + +/* Entry point, usage: PROG IFINDEX. Loads the object file named after argv[0] with a _kern.o suffix, installs the SIGINT handler, passes prog_fd[0] to set_link_xdp_fd for the given interface and then polls statistics every 2 seconds. Returns 1 on a usage, load or link error. */ int main(int ac, char **argv) +{ + char filename[256]; + + /* the object name is derived from the program name, which lets the Makefile build xdp2 from this same source */ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (ac != 2) { + printf("usage: %s IFINDEX\n", argv[0]); + return 1; + } + + /* NOTE(review): the conversion result is not validated; non-numeric input yields 0 */ ifindex = strtoul(argv[1], NULL, 0); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + if (!prog_fd[0]) { + 
printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + poll_stats(2); + + return 0; +} diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c new file mode 100644 index 000000000000..e01288867d15 --- /dev/null +++ b/samples/bpf/xdp2_kern.c @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PLUMgrid + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" + +/* per-CPU array map with 256 long-sized entries; xdp_prog1 indexes it with the value returned by parse_ipv4 or parse_ipv6 */ struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +/* Swap the first three unsigned shorts at data with the next three, in place (the destination and source MAC fields, going by the function name and its caller passing the packet start). */ static void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +/* Return the protocol field of the struct iphdr located at data + nh_off, or 0 when that header would extend past data_end. */ static int parse_ipv4(void *data, u64 nh_off, void *data_end) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +/* Return the nexthdr field of the struct ipv6hdr located at data + nh_off, or 0 when that header would extend past data_end. */ static int parse_ipv6(void *data, u64 nh_off, void *data_end) +{ + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +/* XDP program: the default verdict is XDP_DROP. Steps over up to two VLAN headers, increments the rxcnt entry keyed by the parsed IP protocol (0 when the frame is neither IPv4 nor IPv6 or the IP header is truncated), and for UDP swaps the MAC fields and returns XDP_TX. Frames too short for the Ethernet or VLAN header are dropped without being counted. */ SEC("xdp1") +int xdp_prog1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + long *value; + u16 h_proto; + u64 nh_off; + u32 ipproto; + + nh_off = sizeof(*eth); + /* bounds check before reading any Ethernet header field */ if (data + nh_off > data_end) + return rc; + + h_proto = 
eth->h_proto; + + /* first of two identical VLAN checks: advance past the tag and take the encapsulated protocol */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + /* second VLAN check, covering a doubly tagged frame */ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + if (h_proto == htons(ETH_P_IP)) + ipproto = parse_ipv4(data, nh_off, data_end); + else if (h_proto == htons(ETH_P_IPV6)) + ipproto = parse_ipv6(data, nh_off, data_end); + else + ipproto = 0; + + /* count the packet under its protocol number; the lookup result is checked before use */ value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + + /* UDP packets get their MAC fields swapped and the verdict changed to XDP_TX; everything else keeps XDP_DROP */ if (ipproto == IPPROTO_UDP) { + swap_src_dst_mac(data); + rc = XDP_TX; + } + + return rc; +} + +char _license[] SEC("license") = "GPL"; |